1 diff -Nurp ref-linux-2.6.16.9/arch/i386/boot-xen/Makefile tmp-linux-2.6-xen.patch/arch/i386/boot-xen/Makefile
2 --- ref-linux-2.6.16.9/arch/i386/boot-xen/Makefile      1970-01-01 01:00:00.000000000 +0100
3 +++ tmp-linux-2.6-xen.patch/arch/i386/boot-xen/Makefile 2006-04-10 00:05:52.000000000 +0200
4 @@ -0,0 +1,21 @@
5 +
6 +OBJCOPYFLAGS := -g --strip-unneeded
7 +
8 +vmlinuz: vmlinux-stripped FORCE
9 +       $(call if_changed,gzip)
10 +
11 +vmlinux-stripped: vmlinux FORCE
12 +       $(call if_changed,objcopy)
13 +
14 +INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH))
15 +
16 +XINSTALL_NAME ?= $(KERNELRELEASE)
17 +install:
18 +       mkdir -p $(INSTALL_ROOT)/boot
19 +       ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
20 +       rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
21 +       install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
22 +       install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
23 +       install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
24 +       install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
25 +       ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
26 diff -Nurp ref-linux-2.6.16.9/arch/i386/Kconfig tmp-linux-2.6-xen.patch/arch/i386/Kconfig
27 --- ref-linux-2.6.16.9/arch/i386/Kconfig        2006-04-19 08:10:14.000000000 +0200
28 +++ tmp-linux-2.6-xen.patch/arch/i386/Kconfig   2006-04-10 00:05:52.000000000 +0200
29 @@ -58,6 +58,15 @@ config X86_PC
30         help
31           Choose this option if your computer is a standard PC or compatible.
32  
33 +config X86_XEN
34 +       bool "Xen-compatible"
35 +       select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
36 +       select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
37 +       select SWIOTLB
38 +       help
39 +         Choose this option if you plan to run this kernel on top of the
40 +         Xen Hypervisor.
41 +
42  config X86_ELAN
43         bool "AMD Elan"
44         help
45 @@ -159,6 +168,7 @@ source "arch/i386/Kconfig.cpu"
46  
47  config HPET_TIMER
48         bool "HPET Timer Support"
49 +       depends on !X86_XEN
50         help
51           This enables the use of the HPET for the kernel's internal timer.
52           HPET is the next generation timer replacing legacy 8254s.
53 @@ -202,6 +212,19 @@ config SMP
54  
55           If you don't know what to do here, say N.
56  
57 +config SMP_ALTERNATIVES
58 +       bool "SMP alternatives support (EXPERIMENTAL)"
59 +       depends on SMP && EXPERIMENTAL
60 +       help
61 +         Try to reduce the overhead of running an SMP kernel on a uniprocessor
62 +         host slightly by replacing certain key instruction sequences
63 +         according to whether we currently have more than one CPU available.
64 +         This should provide a noticeable boost to performance when
65 +         running SMP kernels on UP machines, and have negligible impact
66 +         when running on a true SMP host.
67 +
68 +         If unsure, say N.
69 +
70  config NR_CPUS
71         int "Maximum number of CPUs (2-255)"
72         range 2 255
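The SMP_ALTERNATIVES option added above boils down to rewriting recorded instruction sites in place once the number of online CPUs is known. A minimal sketch of that idea, with hypothetical names (the in-kernel version collects its patch sites in a dedicated ELF section via assembler macros, and must also unprotect text pages and flush the instruction cache):

#include <stddef.h>
#include <string.h>

/* One patch site: where the instruction lives and its two variants. */
struct smp_alt_site {
        unsigned char *addr;            /* instruction bytes to rewrite */
        const unsigned char *smp;       /* variant for more than one CPU */
        const unsigned char *up;        /* variant for a single CPU */
        size_t len;
};

/* Hypothetical site table; the real kernel walks an ELF section. */
static struct smp_alt_site sites[16];
static size_t nr_sites;

static void patch_for_ncpus(unsigned int ncpus)
{
        size_t i;

        for (i = 0; i < nr_sites; i++)
                memcpy(sites[i].addr,
                       ncpus > 1 ? sites[i].smp : sites[i].up,
                       sites[i].len);
        /* Real code also flushes caches and restores page protection. */
}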
73 @@ -218,7 +241,7 @@ config NR_CPUS
74  
75  config SCHED_SMT
76         bool "SMT (Hyperthreading) scheduler support"
77 -       depends on SMP
78 +       depends on SMP && !X86_XEN
79         default off
80         help
81           SMT scheduler support improves the CPU scheduler's decision making
82 @@ -230,7 +253,7 @@ source "kernel/Kconfig.preempt"
83  
84  config X86_UP_APIC
85         bool "Local APIC support on uniprocessors"
86 -       depends on !SMP && !(X86_VISWS || X86_VOYAGER)
87 +       depends on !SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
88         help
89           A local APIC (Advanced Programmable Interrupt Controller) is an
90           integrated interrupt controller in the CPU. If you have a single-CPU
91 @@ -255,12 +278,12 @@ config X86_UP_IOAPIC
92  
93  config X86_LOCAL_APIC
94         bool
95 -       depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)
96 +       depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
97         default y
98  
99  config X86_IO_APIC
100         bool
101 -       depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
102 +       depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
103         default y
104  
105  config X86_VISWS_APIC
106 @@ -268,9 +291,14 @@ config X86_VISWS_APIC
107         depends on X86_VISWS
108         default y
109  
110 +config X86_TSC
111 +       bool
112 +       depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ && !X86_XEN
113 +       default y
114 +
115  config X86_MCE
116         bool "Machine Check Exception"
117 -       depends on !X86_VOYAGER
118 +       depends on !(X86_VOYAGER || X86_XEN)
119         ---help---
120           Machine Check Exception support allows the processor to notify the
121           kernel if it detects a problem (e.g. overheating, component failure).
122 @@ -360,6 +388,7 @@ config X86_REBOOTFIXUPS
123  
124  config MICROCODE
125         tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
126 +       depends on !XEN_UNPRIVILEGED_GUEST
127         ---help---
128           If you say Y here and also to "/dev file system support" in the
129           'File systems' section, you will be able to update the microcode on
130 @@ -377,6 +406,7 @@ config MICROCODE
131  
132  config X86_MSR
133         tristate "/dev/cpu/*/msr - Model-specific register support"
134 +       depends on !X86_XEN
135         help
136           This device gives privileged processes access to the x86
137           Model-Specific Registers (MSRs).  It is a character device with
138 @@ -392,6 +422,10 @@ config X86_CPUID
139           with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
140           /dev/cpu/31/cpuid.
141  
142 +config SWIOTLB
143 +       bool
144 +       default n
145 +
146  source "drivers/firmware/Kconfig"
147  
148  choice
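X86_XEN selects the SWIOTLB symbol defined above because a guest's pseudo-physical memory is not machine-contiguous, so DMA to arbitrary pages has to be bounced through a contiguous pool. A rough sketch of the bounce-buffer idea (hypothetical names, not the kernel's swiotlb API):

#include <stdlib.h>
#include <string.h>

enum dma_dir { TO_DEVICE, FROM_DEVICE };

/* A DMA-safe pool; the real swiotlb reserves this contiguously at boot. */
static char pool[1 << 16];
static size_t pool_used;

/* Map: copy outbound data into the pool and hand the device the
 * pool address instead of the caller's buffer. */
static void *bounce_map(void *buf, size_t len, enum dma_dir dir)
{
        void *slot = pool + pool_used;

        if (pool_used + len > sizeof(pool))
                return NULL;            /* pool exhausted */
        pool_used += len;
        if (dir == TO_DEVICE)
                memcpy(slot, buf, len);
        return slot;
}

/* Unmap: copy inbound data back to the caller's buffer. */
static void bounce_unmap(void *slot, void *buf, size_t len, enum dma_dir dir)
{
        if (dir == FROM_DEVICE)
                memcpy(buf, slot, len);
        /* Real code frees the slot for reuse; this sketch leaks it. */
}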
149 @@ -560,7 +594,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
150  
151  config HIGHPTE
152         bool "Allocate 3rd-level pagetables from highmem"
153 -       depends on HIGHMEM4G || HIGHMEM64G
154 +       depends on (HIGHMEM4G || HIGHMEM64G) && !X86_XEN
155         help
156           The VM uses one page table entry for each page of physical memory.
157           For systems with a lot of RAM, this can be wasteful of precious
158 @@ -569,6 +603,7 @@ config HIGHPTE
159  
160  config MATH_EMULATION
161         bool "Math emulation"
162 +       depends on !X86_XEN
163         ---help---
164           Linux can emulate a math coprocessor (used for floating point
165           operations) if you don't have one. 486DX and Pentium processors have
166 @@ -594,6 +629,8 @@ config MATH_EMULATION
167  
168  config MTRR
169         bool "MTRR (Memory Type Range Register) support"
170 +       depends on !XEN_UNPRIVILEGED_GUEST
171 +       default y if X86_XEN
172         ---help---
173           On Intel P6 family processors (Pentium Pro, Pentium II and later)
174           the Memory Type Range Registers (MTRRs) may be used to control
175 @@ -628,7 +665,7 @@ config MTRR
176  
177  config EFI
178         bool "Boot from EFI support (EXPERIMENTAL)"
179 -       depends on ACPI
180 +       depends on ACPI && !X86_XEN
181         default n
182         ---help---
183         This enables the kernel to boot on EFI platforms using
184 @@ -646,7 +683,7 @@ config EFI
185  
186  config IRQBALANCE
187         bool "Enable kernel irq balancing"
188 -       depends on SMP && X86_IO_APIC
189 +       depends on SMP && X86_IO_APIC && !X86_XEN
190         default y
191         help
192           The default yes will allow the kernel to do irq load balancing.
193 @@ -689,7 +726,7 @@ source kernel/Kconfig.hz
194  
195  config KEXEC
196         bool "kexec system call (EXPERIMENTAL)"
197 -       depends on EXPERIMENTAL
198 +       depends on EXPERIMENTAL && !X86_XEN
199         help
200           kexec is a system call that implements the ability to shutdown your
201           current kernel, and to start another kernel.  It is like a reboot
202 @@ -743,6 +780,7 @@ config HOTPLUG_CPU
203  config DOUBLEFAULT
204         default y
205         bool "Enable doublefault exception handler" if EMBEDDED
206 +       depends on !X86_NO_TSS
207         help
208            This option allows trapping of rare doublefault exceptions that
209            would otherwise cause a system to silently reboot. Disabling this
210 @@ -753,18 +791,20 @@ endmenu
211  
212  
213  menu "Power management options (ACPI, APM)"
214 -       depends on !X86_VOYAGER
215 +       depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
216  
217 +if !X86_XEN
218  source kernel/power/Kconfig
219 +endif
220  
221  source "drivers/acpi/Kconfig"
222  
223  menu "APM (Advanced Power Management) BIOS Support"
224 -depends on PM && !X86_VISWS
225 +depends on PM && !(X86_VISWS || X86_XEN)
226  
227  config APM
228         tristate "APM (Advanced Power Management) BIOS support"
229 -       depends on PM
230 +       depends on PM && PM_LEGACY
231         ---help---
232           APM is a BIOS specification for saving power using several different
233           techniques. This is mostly useful for battery powered laptops with
234 @@ -949,6 +989,7 @@ choice
235  
236  config PCI_GOBIOS
237         bool "BIOS"
238 +       depends on !X86_XEN
239  
240  config PCI_GOMMCONFIG
241         bool "MMConfig"
242 @@ -956,6 +997,13 @@ config PCI_GOMMCONFIG
243  config PCI_GODIRECT
244         bool "Direct"
245  
246 +config PCI_GOXEN_FE
247 +       bool "Xen PCI Frontend"
248 +       depends on X86_XEN
249 +       help
250 +         The PCI device frontend driver allows the kernel to import arbitrary
251 +         PCI devices from a PCI backend to support PCI driver domains.
252 +
253  config PCI_GOANY
254         bool "Any"
255  
256 @@ -963,7 +1011,7 @@ endchoice
257  
258  config PCI_BIOS
259         bool
260 -       depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
261 +       depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY)
262         default y
263  
264  config PCI_DIRECT
265 @@ -976,6 +1024,18 @@ config PCI_MMCONFIG
266         depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
267         default y
268  
269 +config XEN_PCIDEV_FRONTEND
270 +       bool
271 +       depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY)
272 +       default y
273 +
274 +config XEN_PCIDEV_FE_DEBUG
275 +       bool "Xen PCI Frontend Debugging"
276 +       depends on XEN_PCIDEV_FRONTEND
277 +       default n
278 +       help
279 +         Enables some debug statements within the PCI Frontend.
280 +
281  source "drivers/pci/pcie/Kconfig"
282  
283  source "drivers/pci/Kconfig"
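XEN_PCIDEV_FRONTEND, enabled above, builds the guest half of a split driver: virtual PCI devices are exported by a backend (driver) domain and discovered over xenbus. A hedged sketch of how such a frontend registers itself, assuming the xenbus API of the Xen 3.0 tree (header path and field names are from memory and may differ in detail):

#include <linux/init.h>
#include <xen/xenbus.h>         /* assumed header in the xenized tree */

static struct xenbus_device_id pcifront_ids[] = {
        { "pci" },              /* backend advertises devices of type "pci" */
        { "" },
};

/* Called when a matching backend appears in xenstore. */
static int pcifront_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
{
        /* A real driver would allocate a shared ring, grant it to the
         * backend, bind an event channel, then scan the virtual bus. */
        return 0;
}

static struct xenbus_driver pcifront_driver = {
        .name  = "pcifront",
        .ids   = pcifront_ids,
        .probe = pcifront_probe,
};

static int __init pcifront_init(void)
{
        return xenbus_register_frontend(&pcifront_driver);
}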
284 @@ -986,7 +1046,7 @@ config ISA_DMA_API
285  
286  config ISA
287         bool "ISA support"
288 -       depends on !(X86_VOYAGER || X86_VISWS)
289 +       depends on !(X86_VOYAGER || X86_VISWS || X86_XEN)
290         help
291           Find out whether you have ISA slots on your motherboard.  ISA is the
292           name of a bus system, i.e. the way the CPU talks to the other stuff
293 @@ -1013,7 +1073,7 @@ config EISA
294  source "drivers/eisa/Kconfig"
295  
296  config MCA
297 -       bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
298 +       bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN)
299         default y if X86_VOYAGER
300         help
301           MicroChannel Architecture is found in some IBM PS/2 machines and
302 @@ -1056,7 +1116,9 @@ source "fs/Kconfig"
303  menu "Instrumentation Support"
304         depends on EXPERIMENTAL
305  
306 +if !X86_XEN
307  source "arch/i386/oprofile/Kconfig"
308 +endif
309  
310  config KPROBES
311         bool "Kprobes (EXPERIMENTAL)"
312 @@ -1075,6 +1137,8 @@ source "security/Kconfig"
313  
314  source "crypto/Kconfig"
315  
316 +source "drivers/xen/Kconfig"
317 +
318  source "lib/Kconfig"
319  
320  #
321 @@ -1100,7 +1164,7 @@ config X86_SMP
322  
323  config X86_HT
324         bool
325 -       depends on SMP && !(X86_VISWS || X86_VOYAGER)
326 +       depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN)
327         default y
328  
329  config X86_BIOS_REBOOT
330 @@ -1113,6 +1177,21 @@ config X86_TRAMPOLINE
331         depends on X86_SMP || (X86_VOYAGER && SMP)
332         default y
333  
334 +config X86_NO_TSS
335 +       bool
336 +       depends on X86_XEN
337 +       default y
338 +
339 +config X86_SYSENTER
340 +       bool
341 +       depends on !X86_NO_TSS
342 +       default y
343 +
344 +config X86_NO_IDT
345 +       bool
346 +       depends on X86_XEN
347 +       default y
348 +
349  config KTIME_SCALAR
350         bool
351         default y
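X86_NO_TSS and X86_NO_IDT, defined just above, record that a Xen guest never executes ltr or lidt itself: exception entry points are handed to the hypervisor instead of being installed in a guest-owned IDT. A sketch of what replaces IDT setup, based on the Xen 3.0 trap_info/HYPERVISOR_set_trap_table interface (include paths and the exact hypercall wrapper are assumptions):

#include <linux/init.h>
#include <asm/segment.h>                /* __KERNEL_CS */
#include <xen/interface/xen.h>          /* struct trap_info (assumed path) */

extern void divide_error(void);         /* usual handler stubs */
extern void page_fault(void);

/* Instead of building an IDT and pointing lidt at it, the guest passes
 * a vector-to-handler table to Xen; {0,0,0,0} terminates the table. */
static struct trap_info trap_table[] = {
        {  0, 0, __KERNEL_CS, (unsigned long)divide_error },
        { 14, 0, __KERNEL_CS, (unsigned long)page_fault   },
        {  0, 0, 0, 0 }
};

static void __init xen_init_traps(void)
{
        HYPERVISOR_set_trap_table(trap_table);
}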
352 diff -Nurp ref-linux-2.6.16.9/arch/i386/Kconfig.cpu tmp-linux-2.6-xen.patch/arch/i386/Kconfig.cpu
353 --- ref-linux-2.6.16.9/arch/i386/Kconfig.cpu    2006-04-19 08:10:14.000000000 +0200
354 +++ tmp-linux-2.6-xen.patch/arch/i386/Kconfig.cpu       2006-04-10 00:05:52.000000000 +0200
355 @@ -251,7 +251,7 @@ config X86_PPRO_FENCE
356  
357  config X86_F00F_BUG
358         bool
359 -       depends on M586MMX || M586TSC || M586 || M486 || M386
360 +       depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT
361         default y
362  
363  config X86_WP_WORKS_OK
364 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/acpi/boot-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/acpi/boot-xen.c
365 --- ref-linux-2.6.16.9/arch/i386/kernel/acpi/boot-xen.c 1970-01-01 01:00:00.000000000 +0100
366 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/acpi/boot-xen.c    2006-04-10 00:05:52.000000000 +0200
367 @@ -0,0 +1,1161 @@
368 +/*
369 + *  boot.c - Architecture-Specific Low-Level ACPI Boot Support
370 + *
371 + *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
372 + *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
373 + *
374 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
375 + *
376 + *  This program is free software; you can redistribute it and/or modify
377 + *  it under the terms of the GNU General Public License as published by
378 + *  the Free Software Foundation; either version 2 of the License, or
379 + *  (at your option) any later version.
380 + *
381 + *  This program is distributed in the hope that it will be useful,
382 + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
383 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
384 + *  GNU General Public License for more details.
385 + *
386 + *  You should have received a copy of the GNU General Public License
387 + *  along with this program; if not, write to the Free Software
388 + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
389 + *
390 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
391 + */
392 +
393 +#include <linux/init.h>
394 +#include <linux/config.h>
395 +#include <linux/acpi.h>
396 +#include <linux/efi.h>
397 +#include <linux/module.h>
398 +#include <linux/dmi.h>
399 +#include <linux/irq.h>
400 +
401 +#include <asm/pgtable.h>
402 +#include <asm/io_apic.h>
403 +#include <asm/apic.h>
404 +#include <asm/io.h>
405 +#include <asm/mpspec.h>
406 +
407 +#ifdef CONFIG_X86_64
408 +
409 +extern void __init clustered_apic_check(void);
410 +
411 +extern int gsi_irq_sharing(int gsi);
412 +#include <asm/proto.h>
413 +
414 +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
415 +
416 +
417 +#else                          /* X86 */
418 +
419 +#ifdef CONFIG_X86_LOCAL_APIC
420 +#include <mach_apic.h>
421 +#include <mach_mpparse.h>
422 +#endif                         /* CONFIG_X86_LOCAL_APIC */
423 +
424 +static inline int gsi_irq_sharing(int gsi) { return gsi; }
425 +
426 +#endif                         /* X86 */
427 +
428 +#define BAD_MADT_ENTRY(entry, end) (                                       \
429 +               (!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
430 +               ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
431 +
432 +#define PREFIX                 "ACPI: "
433 +
434 +int acpi_noirq __initdata;     /* skip ACPI IRQ initialization */
435 +int acpi_pci_disabled __initdata;      /* skip ACPI PCI scan and IRQ initialization */
436 +int acpi_ht __initdata = 1;    /* enable HT */
437 +
438 +int acpi_lapic;
439 +int acpi_ioapic;
440 +int acpi_strict;
441 +EXPORT_SYMBOL(acpi_strict);
442 +
443 +acpi_interrupt_flags acpi_sci_flags __initdata;
444 +int acpi_sci_override_gsi __initdata;
445 +int acpi_skip_timer_override __initdata;
446 +
447 +#ifdef CONFIG_X86_LOCAL_APIC
448 +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
449 +#endif
450 +
451 +#ifndef __HAVE_ARCH_CMPXCHG
452 +#warning ACPI uses CMPXCHG, i486 and later hardware
453 +#endif
454 +
455 +#define MAX_MADT_ENTRIES       256
456 +u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
457 +    {[0 ... MAX_MADT_ENTRIES - 1] = 0xff };
458 +EXPORT_SYMBOL(x86_acpiid_to_apicid);
459 +
460 +/* --------------------------------------------------------------------------
461 +                              Boot-time Configuration
462 +   -------------------------------------------------------------------------- */
463 +
464 +/*
465 + * The default interrupt routing model is PIC (8259).  This gets
466 + * overridden if IOAPICs are enumerated (below).
467 + */
468 +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
469 +
470 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
471 +
472 +/* rely on all ACPI tables being in the direct mapping */
473 +char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
474 +{
475 +       if (!phys_addr || !size)
476 +               return NULL;
477 +
478 +       if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
479 +               return __va(phys_addr);
480 +
481 +       return NULL;
482 +}
483 +
484 +#else
485 +
486 +/*
487 + * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
488 + * to map the target physical address. The problem is that set_fixmap()
489 + * provides a single page, and it is possible that the page is not
490 + * sufficient.
491 + * By using this area, we can map up to MAX_IO_APICS pages temporarily,
492 + * i.e. until the next __va_range() call.
493 + *
494 + * Important Safety Note:  The fixed I/O APIC page numbers are *subtracted*
495 + * from the fixed base.  That's why we start at FIX_IO_APIC_BASE_END and
496 + * count idx down while incrementing the phys address.
497 + */
498 +char *__acpi_map_table(unsigned long phys, unsigned long size)
499 +{
500 +       unsigned long base, offset, mapped_size;
501 +       int idx;
502 +
503 +#ifndef CONFIG_XEN
504 +       if (phys + size < 8 * 1024 * 1024)
505 +               return __va(phys);
506 +#endif
507 +
508 +       offset = phys & (PAGE_SIZE - 1);
509 +       mapped_size = PAGE_SIZE - offset;
510 +       set_fixmap(FIX_ACPI_END, phys);
511 +       base = fix_to_virt(FIX_ACPI_END);
512 +
513 +       /*
514 +        * Most cases can be covered by the below.
515 +        */
516 +       idx = FIX_ACPI_END;
517 +       while (mapped_size < size) {
518 +               if (--idx < FIX_ACPI_BEGIN)
519 +                       return NULL;    /* cannot handle this */
520 +               phys += PAGE_SIZE;
521 +               set_fixmap(idx, phys);
522 +               mapped_size += PAGE_SIZE;
523 +       }
524 +
525 +       return ((unsigned char *)base + offset);
526 +}
527 +#endif
528 +
529 +#ifdef CONFIG_PCI_MMCONFIG
530 +/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
531 +struct acpi_table_mcfg_config *pci_mmcfg_config;
532 +int pci_mmcfg_config_num;
533 +
534 +int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
535 +{
536 +       struct acpi_table_mcfg *mcfg;
537 +       unsigned long i;
538 +       int config_size;
539 +
540 +       if (!phys_addr || !size)
541 +               return -EINVAL;
542 +
543 +       mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size);
544 +       if (!mcfg) {
545 +               printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
546 +               return -ENODEV;
547 +       }
548 +
549 +       /* how many config structures do we have */
550 +       pci_mmcfg_config_num = 0;
551 +       i = size - sizeof(struct acpi_table_mcfg);
552 +       while (i >= sizeof(struct acpi_table_mcfg_config)) {
553 +               ++pci_mmcfg_config_num;
554 +               i -= sizeof(struct acpi_table_mcfg_config);
555 +       }
556 +       if (pci_mmcfg_config_num == 0) {
557 +               printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
558 +               return -ENODEV;
559 +       }
560 +
561 +       config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
562 +       pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
563 +       if (!pci_mmcfg_config) {
564 +               printk(KERN_WARNING PREFIX
565 +                      "No memory for MCFG config tables\n");
566 +               return -ENOMEM;
567 +       }
568 +
569 +       memcpy(pci_mmcfg_config, &mcfg->config, config_size);
570 +       for (i = 0; i < pci_mmcfg_config_num; ++i) {
571 +               if (mcfg->config[i].base_reserved) {
572 +                       printk(KERN_ERR PREFIX
573 +                              "MMCONFIG not in low 4GB of memory\n");
574 +                       return -ENODEV;
575 +               }
576 +       }
577 +
578 +       return 0;
579 +}
580 +#endif                         /* CONFIG_PCI_MMCONFIG */
581 +
582 +#ifdef CONFIG_X86_LOCAL_APIC
583 +static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
584 +{
585 +       struct acpi_table_madt *madt = NULL;
586 +
587 +       if (!phys_addr || !size)
588 +               return -EINVAL;
589 +
590 +       madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
591 +       if (!madt) {
592 +               printk(KERN_WARNING PREFIX "Unable to map MADT\n");
593 +               return -ENODEV;
594 +       }
595 +
596 +       if (madt->lapic_address) {
597 +               acpi_lapic_addr = (u64) madt->lapic_address;
598 +
599 +               printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
600 +                      madt->lapic_address);
601 +       }
602 +
603 +       acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
604 +
605 +       return 0;
606 +}
607 +
608 +static int __init
609 +acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
610 +{
611 +       struct acpi_table_lapic *processor = NULL;
612 +
613 +       processor = (struct acpi_table_lapic *)header;
614 +
615 +       if (BAD_MADT_ENTRY(processor, end))
616 +               return -EINVAL;
617 +
618 +       acpi_table_print_madt_entry(header);
619 +
620 +       /* Record local apic id only when enabled */
621 +       if (processor->flags.enabled)
622 +               x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
623 +
624 +       /*
625 +        * We need to register disabled CPU as well to permit
626 +        * counting disabled CPUs. This allows us to size
627 +        * cpus_possible_map more accurately, to permit
628 +        * to not preallocating memory for all NR_CPUS
629 +        * when we use CPU hotplug.
630 +        */
631 +       mp_register_lapic(processor->id,        /* APIC ID */
632 +                         processor->flags.enabled);    /* Enabled? */
633 +
634 +       return 0;
635 +}
636 +
637 +static int __init
638 +acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
639 +                         const unsigned long end)
640 +{
641 +       struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
642 +
643 +       lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header;
644 +
645 +       if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
646 +               return -EINVAL;
647 +
648 +       acpi_lapic_addr = lapic_addr_ovr->address;
649 +
650 +       return 0;
651 +}
652 +
653 +static int __init
654 +acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
655 +{
656 +       struct acpi_table_lapic_nmi *lapic_nmi = NULL;
657 +
658 +       lapic_nmi = (struct acpi_table_lapic_nmi *)header;
659 +
660 +       if (BAD_MADT_ENTRY(lapic_nmi, end))
661 +               return -EINVAL;
662 +
663 +       acpi_table_print_madt_entry(header);
664 +
665 +       if (lapic_nmi->lint != 1)
666 +               printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
667 +
668 +       return 0;
669 +}
670 +
671 +#endif                         /*CONFIG_X86_LOCAL_APIC */
672 +
673 +#ifdef CONFIG_X86_IO_APIC
674 +
675 +static int __init
676 +acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
677 +{
678 +       struct acpi_table_ioapic *ioapic = NULL;
679 +
680 +       ioapic = (struct acpi_table_ioapic *)header;
681 +
682 +       if (BAD_MADT_ENTRY(ioapic, end))
683 +               return -EINVAL;
684 +
685 +       acpi_table_print_madt_entry(header);
686 +
687 +       mp_register_ioapic(ioapic->id,
688 +                          ioapic->address, ioapic->global_irq_base);
689 +
690 +       return 0;
691 +}
692 +
693 +/*
694 + * Parse Interrupt Source Override for the ACPI SCI
695 + */
696 +static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
697 +{
698 +       if (trigger == 0)       /* compatible SCI trigger is level */
699 +               trigger = 3;
700 +
701 +       if (polarity == 0)      /* compatible SCI polarity is low */
702 +               polarity = 3;
703 +
704 +       /* Command-line over-ride via acpi_sci= */
705 +       if (acpi_sci_flags.trigger)
706 +               trigger = acpi_sci_flags.trigger;
707 +
708 +       if (acpi_sci_flags.polarity)
709 +               polarity = acpi_sci_flags.polarity;
710 +
711 +       /*
712 +        * mp_config_acpi_legacy_irqs() already setup IRQs < 16
713 +        * If GSI is < 16, this will update its flags,
714 +        * else it will create a new mp_irqs[] entry.
715 +        */
716 +       mp_override_legacy_irq(gsi, polarity, trigger, gsi);
717 +
718 +       /*
719 +        * stash over-ride to indicate we've been here
720 +        * and for later update of acpi_fadt
721 +        */
722 +       acpi_sci_override_gsi = gsi;
723 +       return;
724 +}
725 +
726 +static int __init
727 +acpi_parse_int_src_ovr(acpi_table_entry_header * header,
728 +                      const unsigned long end)
729 +{
730 +       struct acpi_table_int_src_ovr *intsrc = NULL;
731 +
732 +       intsrc = (struct acpi_table_int_src_ovr *)header;
733 +
734 +       if (BAD_MADT_ENTRY(intsrc, end))
735 +               return -EINVAL;
736 +
737 +       acpi_table_print_madt_entry(header);
738 +
739 +       if (intsrc->bus_irq == acpi_fadt.sci_int) {
740 +               acpi_sci_ioapic_setup(intsrc->global_irq,
741 +                                     intsrc->flags.polarity,
742 +                                     intsrc->flags.trigger);
743 +               return 0;
744 +       }
745 +
746 +       if (acpi_skip_timer_override &&
747 +           intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
748 +               printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
749 +               return 0;
750 +       }
751 +
752 +       mp_override_legacy_irq(intsrc->bus_irq,
753 +                              intsrc->flags.polarity,
754 +                              intsrc->flags.trigger, intsrc->global_irq);
755 +
756 +       return 0;
757 +}
758 +
759 +static int __init
760 +acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
761 +{
762 +       struct acpi_table_nmi_src *nmi_src = NULL;
763 +
764 +       nmi_src = (struct acpi_table_nmi_src *)header;
765 +
766 +       if (BAD_MADT_ENTRY(nmi_src, end))
767 +               return -EINVAL;
768 +
769 +       acpi_table_print_madt_entry(header);
770 +
771 +       /* TBD: Support nmi_src entries? */
772 +
773 +       return 0;
774 +}
775 +
776 +#endif                         /* CONFIG_X86_IO_APIC */
777 +
778 +/*
779 + * acpi_pic_sci_set_trigger()
780 + * 
781 + * use ELCR to set PIC-mode trigger type for SCI
782 + *
783 + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
784 + * it may require Edge Trigger -- use "acpi_sci=edge"
785 + *
786 + * Ports 0x4d0-0x4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
787 + * for the 8259 PIC.  bit[n] = 1 means irq[n] is Level, otherwise Edge.
788 + * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
789 + * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
790 + */
791 +
792 +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
793 +{
794 +       unsigned int mask = 1 << irq;
795 +       unsigned int old, new;
796 +
797 +       /* Real old ELCR mask */
798 +       old = inb(0x4d0) | (inb(0x4d1) << 8);
799 +
800 +       /*
801 +        * If we use ACPI to set PCI irq's, then we should clear ELCR
802 +        * since we will set it correctly as we enable the PCI irq
803 +        * routing.
804 +        */
805 +       new = acpi_noirq ? old : 0;
806 +
807 +       /*
808 +        * Update SCI information in the ELCR; it isn't in the PCI
809 +        * routing tables.
810 +        */
811 +       switch (trigger) {
812 +       case 1:         /* Edge - clear */
813 +               new &= ~mask;
814 +               break;
815 +       case 3:         /* Level - set */
816 +               new |= mask;
817 +               break;
818 +       }
819 +
820 +       if (old == new)
821 +               return;
822 +
823 +       printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
824 +       outb(new, 0x4d0);
825 +       outb(new >> 8, 0x4d1);
826 +}
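/*
 * Illustrative sketch (editorial, not part of the patch): the inverse
 * query of what acpi_pic_sci_set_trigger() programs above.  Bit n of
 * the 16-bit value at ports 0x4d0/0x4d1 says whether IRQ n is level
 * triggered.
 */
static int elcr_irq_is_level(unsigned int irq)
{
	unsigned int elcr = inb(0x4d0) | (inb(0x4d1) << 8);

	return (elcr >> irq) & 1;
}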
827 +
828 +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
829 +{
830 +#ifdef CONFIG_X86_IO_APIC
831 +       if (use_pci_vector() && !platform_legacy_irq(gsi))
832 +               *irq = IO_APIC_VECTOR(gsi);
833 +       else
834 +#endif
835 +               *irq = gsi_irq_sharing(gsi);
836 +       return 0;
837 +}
838 +
839 +/*
840 + * success: return IRQ number (>=0)
841 + * failure: return < 0
842 + */
843 +int acpi_register_gsi(u32 gsi, int triggering, int polarity)
844 +{
845 +       unsigned int irq;
846 +       unsigned int plat_gsi = gsi;
847 +
848 +#ifdef CONFIG_PCI
849 +       /*
850 +        * Make sure all (legacy) PCI IRQs are set as level-triggered.
851 +        */
852 +       if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
853 +               extern void eisa_set_level_irq(unsigned int irq);
854 +
855 +               if (triggering == ACPI_LEVEL_SENSITIVE)
856 +                       eisa_set_level_irq(gsi);
857 +       }
858 +#endif
859 +
860 +#ifdef CONFIG_X86_IO_APIC
861 +       if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
862 +               plat_gsi = mp_register_gsi(gsi, triggering, polarity);
863 +       }
864 +#endif
865 +       acpi_gsi_to_irq(plat_gsi, &irq);
866 +       return irq;
867 +}
868 +
869 +EXPORT_SYMBOL(acpi_register_gsi);
870 +
871 +/*
872 + *  ACPI based hotplug support for CPU
873 + */
874 +#ifdef CONFIG_ACPI_HOTPLUG_CPU
875 +int acpi_map_lsapic(acpi_handle handle, int *pcpu)
876 +{
877 +       /* TBD */
878 +       return -EINVAL;
879 +}
880 +
881 +EXPORT_SYMBOL(acpi_map_lsapic);
882 +
883 +int acpi_unmap_lsapic(int cpu)
884 +{
885 +       /* TBD */
886 +       return -EINVAL;
887 +}
888 +
889 +EXPORT_SYMBOL(acpi_unmap_lsapic);
890 +#endif                         /* CONFIG_ACPI_HOTPLUG_CPU */
891 +
892 +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
893 +{
894 +       /* TBD */
895 +       return -EINVAL;
896 +}
897 +
898 +EXPORT_SYMBOL(acpi_register_ioapic);
899 +
900 +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
901 +{
902 +       /* TBD */
903 +       return -EINVAL;
904 +}
905 +
906 +EXPORT_SYMBOL(acpi_unregister_ioapic);
907 +
908 +static unsigned long __init
909 +acpi_scan_rsdp(unsigned long start, unsigned long length)
910 +{
911 +       unsigned long offset = 0;
912 +       unsigned long sig_len = sizeof("RSD PTR ") - 1;
913 +       unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
914 +
915 +       /*
916 +        * Scan all 16-byte boundaries of the physical memory region for the
917 +        * RSDP signature.
918 +        */
919 +       for (offset = 0; offset < length; offset += 16) {
920 +               if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len))
921 +                       continue;
922 +               return (start + offset);
923 +       }
924 +
925 +       return 0;
926 +}
927 +
928 +static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
929 +{
930 +       struct acpi_table_sbf *sb;
931 +
932 +       if (!phys_addr || !size)
933 +               return -EINVAL;
934 +
935 +       sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size);
936 +       if (!sb) {
937 +               printk(KERN_WARNING PREFIX "Unable to map SBF\n");
938 +               return -ENODEV;
939 +       }
940 +
941 +       sbf_port = sb->sbf_cmos;        /* Save CMOS port */
942 +
943 +       return 0;
944 +}
945 +
946 +#ifdef CONFIG_HPET_TIMER
947 +
948 +static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
949 +{
950 +       struct acpi_table_hpet *hpet_tbl;
951 +
952 +       if (!phys || !size)
953 +               return -EINVAL;
954 +
955 +       hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size);
956 +       if (!hpet_tbl) {
957 +               printk(KERN_WARNING PREFIX "Unable to map HPET\n");
958 +               return -ENODEV;
959 +       }
960 +
961 +       if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
962 +               printk(KERN_WARNING PREFIX "HPET timers must be located in "
963 +                      "memory.\n");
964 +               return -1;
965 +       }
966 +#ifdef CONFIG_X86_64
967 +       vxtime.hpet_address = hpet_tbl->addr.addrl |
968 +           ((long)hpet_tbl->addr.addrh << 32);
969 +
970 +       printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
971 +              hpet_tbl->id, vxtime.hpet_address);
972 +#else                          /* X86 */
973 +       {
974 +               extern unsigned long hpet_address;
975 +
976 +               hpet_address = hpet_tbl->addr.addrl;
977 +               printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
978 +                      hpet_tbl->id, hpet_address);
979 +       }
980 +#endif                         /* X86 */
981 +
982 +       return 0;
983 +}
984 +#else
985 +#define        acpi_parse_hpet NULL
986 +#endif
987 +
988 +#ifdef CONFIG_X86_PM_TIMER
989 +extern u32 pmtmr_ioport;
990 +#endif
991 +
992 +static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
993 +{
994 +       struct fadt_descriptor_rev2 *fadt = NULL;
995 +
996 +       fadt = (struct fadt_descriptor_rev2 *)__acpi_map_table(phys, size);
997 +       if (!fadt) {
998 +               printk(KERN_WARNING PREFIX "Unable to map FADT\n");
999 +               return 0;
1000 +       }
1001 +       /* initialize sci_int early for INT_SRC_OVR MADT parsing */
1002 +       acpi_fadt.sci_int = fadt->sci_int;
1003 +
1004 +       /* initialize rev and apic_phys_dest_mode for x86_64 genapic */
1005 +       acpi_fadt.revision = fadt->revision;
1006 +       acpi_fadt.force_apic_physical_destination_mode =
1007 +           fadt->force_apic_physical_destination_mode;
1008 +
1009 +#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN)
1010 +       /* detect the location of the ACPI PM Timer */
1011 +       if (fadt->revision >= FADT2_REVISION_ID) {
1012 +               /* FADT rev. 2 */
1013 +               if (fadt->xpm_tmr_blk.address_space_id !=
1014 +                   ACPI_ADR_SPACE_SYSTEM_IO)
1015 +                       return 0;
1016 +
1017 +               pmtmr_ioport = fadt->xpm_tmr_blk.address;
1018 +               /*
1019 +                * "X" fields are optional extensions to the original V1.0
1020 +                * fields, so we must selectively expand V1.0 fields if the
1021 +                * corresponding X field is zero.
1022 +                */
1023 +               if (!pmtmr_ioport)
1024 +                       pmtmr_ioport = fadt->V1_pm_tmr_blk;
1025 +       } else {
1026 +               /* FADT rev. 1 */
1027 +               pmtmr_ioport = fadt->V1_pm_tmr_blk;
1028 +       }
1029 +       if (pmtmr_ioport)
1030 +               printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
1031 +                      pmtmr_ioport);
1032 +#endif
1033 +       return 0;
1034 +}
1035 +
1036 +unsigned long __init acpi_find_rsdp(void)
1037 +{
1038 +       unsigned long rsdp_phys = 0;
1039 +
1040 +       if (efi_enabled) {
1041 +               if (efi.acpi20)
1042 +                       return __pa(efi.acpi20);
1043 +               else if (efi.acpi)
1044 +                       return __pa(efi.acpi);
1045 +       }
1046 +       /*
1047 +        * Scan memory looking for the RSDP signature. First search EBDA (low
1048 +        * memory) paragraphs and then search upper memory (E0000-FFFFF).
1049 +        */
1050 +       rsdp_phys = acpi_scan_rsdp(0, 0x400);
1051 +       if (!rsdp_phys)
1052 +               rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
1053 +
1054 +       return rsdp_phys;
1055 +}
1056 +
1057 +#ifdef CONFIG_X86_LOCAL_APIC
1058 +/*
1059 + * Parse LAPIC entries in MADT
1060 + * returns 0 on success, < 0 on error
1061 + */
1062 +static int __init acpi_parse_madt_lapic_entries(void)
1063 +{
1064 +       int count;
1065 +
1066 +       /* 
1067 +        * Note that the LAPIC address is obtained from the MADT (32-bit value)
1068 +        * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
1069 +        */
1070 +
1071 +       count =
1072 +           acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR,
1073 +                                 acpi_parse_lapic_addr_ovr, 0);
1074 +       if (count < 0) {
1075 +               printk(KERN_ERR PREFIX
1076 +                      "Error parsing LAPIC address override entry\n");
1077 +               return count;
1078 +       }
1079 +
1080 +       mp_register_lapic_address(acpi_lapic_addr);
1081 +
1082 +       count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
1083 +                                     MAX_APICS);
1084 +       if (!count) {
1085 +               printk(KERN_ERR PREFIX "No LAPIC entries present\n");
1086 +               /* TBD: Cleanup to allow fallback to MPS */
1087 +               return -ENODEV;
1088 +       } else if (count < 0) {
1089 +               printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
1090 +               /* TBD: Cleanup to allow fallback to MPS */
1091 +               return count;
1092 +       }
1093 +
1094 +       count =
1095 +           acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
1096 +       if (count < 0) {
1097 +               printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
1098 +               /* TBD: Cleanup to allow fallback to MPS */
1099 +               return count;
1100 +       }
1101 +       return 0;
1102 +}
1103 +#endif                         /* CONFIG_X86_LOCAL_APIC */
1104 +
1105 +#ifdef CONFIG_X86_IO_APIC
1106 +/*
1107 + * Parse IOAPIC related entries in MADT
1108 + * returns 0 on success, < 0 on error
1109 + */
1110 +static int __init acpi_parse_madt_ioapic_entries(void)
1111 +{
1112 +       int count;
1113 +
1114 +       /*
1115 +        * ACPI interpreter is required to complete interrupt setup,
1116 +        * so if it is off, don't enumerate the io-apics with ACPI.
1117 +        * If MPS is present, it will handle them,
1118 +        * otherwise the system will stay in PIC mode
1119 +        */
1120 +       if (acpi_disabled || acpi_noirq) {
1121 +               return -ENODEV;
1122 +       }
1123 +
1124 +       /*
1125 +        * if "noapic" boot option, don't look for IO-APICs
1126 +        */
1127 +       if (skip_ioapic_setup) {
1128 +               printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
1129 +                      "due to 'noapic' option.\n");
1130 +               return -ENODEV;
1131 +       }
1132 +
1133 +       count =
1134 +           acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic,
1135 +                                 MAX_IO_APICS);
1136 +       if (!count) {
1137 +               printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
1138 +               return -ENODEV;
1139 +       } else if (count < 0) {
1140 +               printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
1141 +               return count;
1142 +       }
1143 +
1144 +       count =
1145 +           acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
1146 +                                 NR_IRQ_VECTORS);
1147 +       if (count < 0) {
1148 +               printk(KERN_ERR PREFIX
1149 +                      "Error parsing interrupt source overrides entry\n");
1150 +               /* TBD: Cleanup to allow fallback to MPS */
1151 +               return count;
1152 +       }
1153 +
1154 +       /*
1155 +        * If BIOS did not supply an INT_SRC_OVR for the SCI,
1156 +        * pretend we got one so we can set the SCI flags.
1157 +        */
1158 +       if (!acpi_sci_override_gsi)
1159 +               acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
1160 +
1161 +       /* Fill in identity legacy mappings where no override */
1162 +       mp_config_acpi_legacy_irqs();
1163 +
1164 +       count =
1165 +           acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
1166 +                                 NR_IRQ_VECTORS);
1167 +       if (count < 0) {
1168 +               printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
1169 +               /* TBD: Cleanup to allow fallback to MPS */
1170 +               return count;
1171 +       }
1172 +
1173 +       return 0;
1174 +}
1175 +#else
1176 +static inline int acpi_parse_madt_ioapic_entries(void)
1177 +{
1178 +       return -1;
1179 +}
1180 +#endif /* !CONFIG_X86_IO_APIC */
1181 +
1182 +static void __init acpi_process_madt(void)
1183 +{
1184 +#ifdef CONFIG_X86_LOCAL_APIC
1185 +       int count, error;
1186 +
1187 +       count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
1188 +       if (count >= 1) {
1189 +
1190 +               /*
1191 +                * Parse MADT LAPIC entries
1192 +                */
1193 +               error = acpi_parse_madt_lapic_entries();
1194 +               if (!error) {
1195 +                       acpi_lapic = 1;
1196 +
1197 +#ifdef CONFIG_X86_GENERICARCH
1198 +                       generic_bigsmp_probe();
1199 +#endif
1200 +                       /*
1201 +                        * Parse MADT IO-APIC entries
1202 +                        */
1203 +                       error = acpi_parse_madt_ioapic_entries();
1204 +                       if (!error) {
1205 +                               acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
1206 +                               acpi_irq_balance_set(NULL);
1207 +                               acpi_ioapic = 1;
1208 +
1209 +                               smp_found_config = 1;
1210 +                               clustered_apic_check();
1211 +                       }
1212 +               }
1213 +               if (error == -EINVAL) {
1214 +                       /*
1215 +                        * Dell Precision Workstation 410, 610 come here.
1216 +                        */
1217 +                       printk(KERN_ERR PREFIX
1218 +                              "Invalid BIOS MADT, disabling ACPI\n");
1219 +                       disable_acpi();
1220 +               }
1221 +       }
1222 +#endif
1223 +       return;
1224 +}
1225 +
1226 +extern int acpi_force;
1227 +
1228 +#ifdef __i386__
1229 +
1230 +static int __init disable_acpi_irq(struct dmi_system_id *d)
1231 +{
1232 +       if (!acpi_force) {
1233 +               printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
1234 +                      d->ident);
1235 +               acpi_noirq_set();
1236 +       }
1237 +       return 0;
1238 +}
1239 +
1240 +static int __init disable_acpi_pci(struct dmi_system_id *d)
1241 +{
1242 +       if (!acpi_force) {
1243 +               printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
1244 +                      d->ident);
1245 +               acpi_disable_pci();
1246 +       }
1247 +       return 0;
1248 +}
1249 +
1250 +static int __init dmi_disable_acpi(struct dmi_system_id *d)
1251 +{
1252 +       if (!acpi_force) {
1253 +               printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
1254 +               disable_acpi();
1255 +       } else {
1256 +               printk(KERN_NOTICE
1257 +                      "Warning: DMI blacklist says broken, but acpi forced\n");
1258 +       }
1259 +       return 0;
1260 +}
1261 +
1262 +/*
1263 + * Limit ACPI to CPU enumeration for HT
1264 + */
1265 +static int __init force_acpi_ht(struct dmi_system_id *d)
1266 +{
1267 +       if (!acpi_force) {
1268 +               printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1269 +                      d->ident);
1270 +               disable_acpi();
1271 +               acpi_ht = 1;
1272 +       } else {
1273 +               printk(KERN_NOTICE
1274 +                      "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1275 +       }
1276 +       return 0;
1277 +}
1278 +
1279 +/*
1280 + * If your system is blacklisted here, but you find that acpi=force
1281 + * works for you, please contact acpi-devel@sourceforge.net
1282 + */
1283 +static struct dmi_system_id __initdata acpi_dmi_table[] = {
1284 +       /*
1285 +        * Boxes that need ACPI disabled
1286 +        */
1287 +       {
1288 +        .callback = dmi_disable_acpi,
1289 +        .ident = "IBM Thinkpad",
1290 +        .matches = {
1291 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1292 +                    DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
1293 +                    },
1294 +        },
1295 +
1296 +       /*
1297 +        * Boxes that need acpi=ht
1298 +        */
1299 +       {
1300 +        .callback = force_acpi_ht,
1301 +        .ident = "FSC Primergy T850",
1302 +        .matches = {
1303 +                    DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1304 +                    DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1305 +                    },
1306 +        },
1307 +       {
1308 +        .callback = force_acpi_ht,
1309 +        .ident = "DELL GX240",
1310 +        .matches = {
1311 +                    DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
1312 +                    DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
1313 +                    },
1314 +        },
1315 +       {
1316 +        .callback = force_acpi_ht,
1317 +        .ident = "HP VISUALIZE NT Workstation",
1318 +        .matches = {
1319 +                    DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1320 +                    DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1321 +                    },
1322 +        },
1323 +       {
1324 +        .callback = force_acpi_ht,
1325 +        .ident = "Compaq Workstation W8000",
1326 +        .matches = {
1327 +                    DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1328 +                    DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1329 +                    },
1330 +        },
1331 +       {
1332 +        .callback = force_acpi_ht,
1333 +        .ident = "ASUS P4B266",
1334 +        .matches = {
1335 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1336 +                    DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1337 +                    },
1338 +        },
1339 +       {
1340 +        .callback = force_acpi_ht,
1341 +        .ident = "ASUS P2B-DS",
1342 +        .matches = {
1343 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1344 +                    DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1345 +                    },
1346 +        },
1347 +       {
1348 +        .callback = force_acpi_ht,
1349 +        .ident = "ASUS CUR-DLS",
1350 +        .matches = {
1351 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1352 +                    DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1353 +                    },
1354 +        },
1355 +       {
1356 +        .callback = force_acpi_ht,
1357 +        .ident = "ABIT i440BX-W83977",
1358 +        .matches = {
1359 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1360 +                    DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1361 +                    },
1362 +        },
1363 +       {
1364 +        .callback = force_acpi_ht,
1365 +        .ident = "IBM Bladecenter",
1366 +        .matches = {
1367 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1368 +                    DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1369 +                    },
1370 +        },
1371 +       {
1372 +        .callback = force_acpi_ht,
1373 +        .ident = "IBM eServer xSeries 360",
1374 +        .matches = {
1375 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1376 +                    DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1377 +                    },
1378 +        },
1379 +       {
1380 +        .callback = force_acpi_ht,
1381 +        .ident = "IBM eserver xSeries 330",
1382 +        .matches = {
1383 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1384 +                    DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1385 +                    },
1386 +        },
1387 +       {
1388 +        .callback = force_acpi_ht,
1389 +        .ident = "IBM eserver xSeries 440",
1390 +        .matches = {
1391 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1392 +                    DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1393 +                    },
1394 +        },
1395 +
1396 +       /*
1397 +        * Boxes that need ACPI PCI IRQ routing disabled
1398 +        */
1399 +       {
1400 +        .callback = disable_acpi_irq,
1401 +        .ident = "ASUS A7V",
1402 +        .matches = {
1403 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
1404 +                    DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
1405 +                    /* newer BIOS, Revision 1011, does work */
1406 +                    DMI_MATCH(DMI_BIOS_VERSION,
1407 +                              "ASUS A7V ACPI BIOS Revision 1007"),
1408 +                    },
1409 +        },
1410 +
1411 +       /*
1412 +        * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
1413 +        */
1414 +       {                       /* _BBN 0 bug */
1415 +        .callback = disable_acpi_pci,
1416 +        .ident = "ASUS PR-DLS",
1417 +        .matches = {
1418 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1419 +                    DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
1420 +                    DMI_MATCH(DMI_BIOS_VERSION,
1421 +                              "ASUS PR-DLS ACPI BIOS Revision 1010"),
1422 +                    DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
1423 +                    },
1424 +        },
1425 +       {
1426 +        .callback = disable_acpi_pci,
1427 +        .ident = "Acer TravelMate 36x Laptop",
1428 +        .matches = {
1429 +                    DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1430 +                    DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1431 +                    },
1432 +        },
1433 +       {}
1434 +};
1435 +
1436 +#endif                         /* __i386__ */
1437 +
1438 +/*
1439 + * acpi_boot_table_init() and acpi_boot_init()
1440 + *  called from setup_arch(), always.
1441 + *     1. checksums all tables
1442 + *     2. enumerates lapics
1443 + *     3. enumerates io-apics
1444 + *
1445 + * acpi_table_init() is separate to allow reading SRAT without
1446 + * other side effects.
1447 + *
1448 + * side effects of acpi_boot_init:
1449 + *     acpi_lapic = 1 if LAPIC found
1450 + *     acpi_ioapic = 1 if IOAPIC found
1451 + *     if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
1452 + *     if acpi_blacklisted() acpi_disabled = 1;
1453 + *     acpi_irq_model=...
1454 + *     ...
1455 + *
1456 + * return value: (currently ignored)
1457 + *     0: success
1458 + *     !0: failure
1459 + */
1460 +
1461 +int __init acpi_boot_table_init(void)
1462 +{
1463 +       int error;
1464 +
1465 +#ifdef __i386__
1466 +       dmi_check_system(acpi_dmi_table);
1467 +#endif
1468 +
1469 +       /*
1470 +        * If acpi_disabled, bail out
1471 +        * One exception: acpi=ht continues far enough to enumerate LAPICs
1472 +        */
1473 +       if (acpi_disabled && !acpi_ht)
1474 +               return 1;
1475 +
1476 +       /* 
1477 +        * Initialize the ACPI boot-time table parser.
1478 +        */
1479 +       error = acpi_table_init();
1480 +       if (error) {
1481 +               disable_acpi();
1482 +               return error;
1483 +       }
1484 +
1485 +       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1486 +
1487 +       /*
1488 +        * blacklist may disable ACPI entirely
1489 +        */
1490 +       error = acpi_blacklisted();
1491 +       if (error) {
1492 +               if (acpi_force) {
1493 +                       printk(KERN_WARNING PREFIX "acpi=force override\n");
1494 +               } else {
1495 +                       printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1496 +                       disable_acpi();
1497 +                       return error;
1498 +               }
1499 +       }
1500 +
1501 +       return 0;
1502 +}
1503 +
1504 +int __init acpi_boot_init(void)
1505 +{
1506 +       /*
1507 +        * If acpi_disabled, bail out
1508 +        * One exception: acpi=ht continues far enough to enumerate LAPICs
1509 +        */
1510 +       if (acpi_disabled && !acpi_ht)
1511 +               return 1;
1512 +
1513 +       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1514 +
1515 +       /*
1516 +        * set sci_int and PM timer address
1517 +        */
1518 +       acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
1519 +
1520 +       /*
1521 +        * Process the Multiple APIC Description Table (MADT), if present
1522 +        */
1523 +       acpi_process_madt();
1524 +
1525 +       acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
1526 +
1527 +       return 0;
1528 +}
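Every table parser in boot-xen.c above follows the same pattern: the acpi_table_parse() callback receives a physical address and must map it through __acpi_map_table() (the FIX_ACPI_BEGIN..FIX_ACPI_END fixmap window on the Xen path) before touching it, and the mapping is only valid until the next call. A minimal consumer in that style (the table type is hypothetical):

/* Hypothetical parser mirroring acpi_parse_madt() et al. above. */
static int __init acpi_parse_example(unsigned long phys, unsigned long size)
{
        struct acpi_table_header *hdr;

        if (!phys || !size)
                return -EINVAL;

        hdr = (struct acpi_table_header *)__acpi_map_table(phys, size);
        if (!hdr) {
                printk(KERN_WARNING PREFIX "Unable to map table\n");
                return -ENODEV;
        }
        /* The mapping is reused by the next __acpi_map_table() call,
         * so copy out anything that is needed long-term. */
        return 0;
}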
1529 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/acpi/Makefile tmp-linux-2.6-xen.patch/arch/i386/kernel/acpi/Makefile
1530 --- ref-linux-2.6.16.9/arch/i386/kernel/acpi/Makefile   2006-04-19 08:10:14.000000000 +0200
1531 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/acpi/Makefile      2006-04-10 00:05:52.000000000 +0200
1532 @@ -6,3 +6,7 @@ ifneq ($(CONFIG_ACPI_PROCESSOR),)
1533  obj-y                          += cstate.o processor.o
1534  endif
1535  
1536 +ifdef CONFIG_XEN
1537 +include $(srctree)/scripts/Makefile.xen
1538 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
1539 +endif
1540 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/apic-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/apic-xen.c
1541 --- ref-linux-2.6.16.9/arch/i386/kernel/apic-xen.c      1970-01-01 01:00:00.000000000 +0100
1542 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/apic-xen.c 2006-04-10 00:05:52.000000000 +0200
1543 @@ -0,0 +1,140 @@
1544 +/*
1545 + *     Local APIC handling, local APIC timers
1546 + *
1547 + *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
1548 + *
1549 + *     Fixes
1550 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
1551 + *                                     thanks to Eric Gilmore
1552 + *                                     and Rolf G. Tews
1553 + *                                     for testing these extensively.
1554 + *     Maciej W. Rozycki       :       Various updates and fixes.
1555 + *     Mikael Pettersson       :       Power Management for UP-APIC.
1556 + *     Pavel Machek and
1557 + *     Mikael Pettersson       :       PM converted to driver model.
1558 + */
1559 +
1560 +#include <linux/config.h>
1561 +#include <linux/init.h>
1562 +
1563 +#include <linux/mm.h>
1564 +#include <linux/delay.h>
1565 +#include <linux/bootmem.h>
1566 +#include <linux/smp_lock.h>
1567 +#include <linux/interrupt.h>
1568 +#include <linux/mc146818rtc.h>
1569 +#include <linux/kernel_stat.h>
1570 +#include <linux/sysdev.h>
1571 +#include <linux/cpu.h>
1572 +#include <linux/module.h>
1573 +
1574 +#include <asm/atomic.h>
1575 +#include <asm/smp.h>
1576 +#include <asm/mtrr.h>
1577 +#include <asm/mpspec.h>
1578 +#include <asm/desc.h>
1579 +#include <asm/arch_hooks.h>
1580 +#include <asm/hpet.h>
1581 +#include <asm/i8253.h>
1582 +
1583 +#include <mach_apic.h>
1584 +#include <mach_ipi.h>
1585 +
1586 +#include "io_ports.h"
1587 +
1588 +#ifndef CONFIG_XEN
1589 +/*
1590 + * cpu_mask that denotes the CPUs that need the timer interrupt delivered
1591 + * as IPIs in place of local APIC timers
1592 + */
1593 +static cpumask_t timer_bcast_ipi;
1594 +#endif
1595 +
1596 +/*
1597 + * Knob to control our willingness to enable the local APIC.
1598 + */
1599 +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
1600 +
1601 +/*
1602 + * Debug level
1603 + */
1604 +int apic_verbosity;
1605 +
1606 +/*
1607 + * 'what should we do if we get a hw irq event on an illegal vector'.
1608 + * Each architecture has to answer this itself.
1609 + */
1610 +void ack_bad_irq(unsigned int irq)
1611 +{
1612 +       printk("unexpected IRQ trap at vector %02x\n", irq);
1613 +       /*
1614 +        * Currently unexpected vectors happen only on SMP and APIC.
1615 +        * We _must_ ack these because every local APIC has only N
1616 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
1617 +        * holds up an irq slot - in excessive cases (when multiple
1618 +        * unexpected vectors occur) that might lock up the APIC
1619 +        * completely.
1620 +        * But only ack when the APIC is enabled -AK
1621 +        */
1622 +       if (cpu_has_apic)
1623 +               ack_APIC_irq();
1624 +}
1625 +
1626 +int get_physical_broadcast(void)
1627 +{
1628 +        return 0xff;
1629 +}
1630 +
1631 +#ifndef CONFIG_XEN
1632 +#ifndef CONFIG_SMP
1633 +static void up_apic_timer_interrupt_call(struct pt_regs *regs)
1634 +{
1635 +       int cpu = smp_processor_id();
1636 +
1637 +       /*
1638 +        * the NMI deadlock-detector uses this.
1639 +        */
1640 +       per_cpu(irq_stat, cpu).apic_timer_irqs++;
1641 +
1642 +       smp_local_timer_interrupt(regs);
1643 +}
1644 +#endif
1645 +
1646 +void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
1647 +{
1648 +       cpumask_t mask;
1649 +
1650 +       cpus_and(mask, cpu_online_map, timer_bcast_ipi);
1651 +       if (!cpus_empty(mask)) {
1652 +#ifdef CONFIG_SMP
1653 +               send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
1654 +#else
1655 +               /*
1656 +                * In the UP case we can call the APIC timer interrupt
1657 +                * handler directly, minus all the IRQ-related bookkeeping.
1658 +                */
1659 +               up_apic_timer_interrupt_call(regs);
1660 +#endif
1661 +       }
1662 +}
1663 +#endif
1664 +
1665 +int setup_profiling_timer(unsigned int multiplier)
1666 +{
1667 +       return -EINVAL;
1668 +}
1669 +
1670 +/*
1671 + * This initializes the IO-APIC and APIC hardware if this is
1672 + * a UP kernel.
1673 + */
1674 +int __init APIC_init_uniprocessor (void)
1675 +{
1676 +#ifdef CONFIG_X86_IO_APIC
1677 +       if (smp_found_config)
1678 +               if (!skip_ioapic_setup && nr_ioapics)
1679 +                       setup_IO_APIC();
1680 +#endif
1681 +
1682 +       return 0;
1683 +}
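
The timer-broadcast path above is a direct use of the 2.6-era cpumask
API. A standalone sketch of the same pattern, with example_broadcast and
want_ipi as made-up names:

#include <linux/cpumask.h>

/* Intersect the online CPUs with those that asked for timer IPIs and
 * act only when the result is non-empty, mirroring the mask arithmetic
 * in smp_send_timer_broadcast_ipi() above. */
static void example_broadcast(cpumask_t want_ipi)
{
	cpumask_t mask;

	cpus_and(mask, cpu_online_map, want_ipi);
	if (!cpus_empty(mask)) {
		/* SMP: send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
		 * UP: call the timer handler directly, as above. */
	}
}
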
1684 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/asm-offsets.c tmp-linux-2.6-xen.patch/arch/i386/kernel/asm-offsets.c
1685 --- ref-linux-2.6.16.9/arch/i386/kernel/asm-offsets.c   2006-04-19 08:10:14.000000000 +0200
1686 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/asm-offsets.c      2006-04-10 00:05:52.000000000 +0200
1687 @@ -13,6 +13,7 @@
1688  #include <asm/fixmap.h>
1689  #include <asm/processor.h>
1690  #include <asm/thread_info.h>
1691 +#include <asm/elf.h>
1692  
1693  #define DEFINE(sym, val) \
1694          asm volatile("\n->" #sym " %0 " #val : : "i" (val))
1695 @@ -63,10 +64,12 @@ void foo(void)
1696         OFFSET(pbe_orig_address, pbe, orig_address);
1697         OFFSET(pbe_next, pbe, next);
1698  
1699 +#ifdef CONFIG_X86_SYSENTER
1700         /* Offset from the sysenter stack to tss.esp0 */
1701         DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
1702                  sizeof(struct tss_struct));
1703 +#endif
1704  
1705         DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
1706 -       DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
1707 +       DEFINE(VSYSCALL_BASE, VSYSCALL_BASE);
1708  }
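
The DEFINE() macro in asm-offsets.c works by emitting a "->NAME VALUE"
marker into the compiler's assembly output; the kernel build then
rewrites each marker into a #define in the generated asm-offsets.h,
which assembly such as entry-xen.S includes. A self-contained sketch of
the mechanism (struct tss_like and ESP0_off are made-up names):

#include <stddef.h>

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

/* Stand-in for struct tss_struct, just to have an offset to export. */
struct tss_like { long pad[24]; unsigned long esp0; };

void foo(void)
{
	/* Compile with "gcc -S": the .s output contains a line of the
	 * form "->ESP0_off $<offset> ...", which a sed pass over the
	 * generated assembly turns into "#define ESP0_off <offset>". */
	DEFINE(ESP0_off, offsetof(struct tss_like, esp0));
}
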
1709 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/cpu/common-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/cpu/common-xen.c
1710 --- ref-linux-2.6.16.9/arch/i386/kernel/cpu/common-xen.c        1970-01-01 01:00:00.000000000 +0100
1711 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/cpu/common-xen.c   2006-04-10 00:05:52.000000000 +0200
1712 @@ -0,0 +1,715 @@
1713 +#include <linux/init.h>
1714 +#include <linux/string.h>
1715 +#include <linux/delay.h>
1716 +#include <linux/smp.h>
1717 +#include <linux/module.h>
1718 +#include <linux/percpu.h>
1719 +#include <linux/bootmem.h>
1720 +#include <asm/semaphore.h>
1721 +#include <asm/processor.h>
1722 +#include <asm/i387.h>
1723 +#include <asm/msr.h>
1724 +#include <asm/io.h>
1725 +#include <asm/mmu_context.h>
1726 +#ifdef CONFIG_X86_LOCAL_APIC
1727 +#include <asm/mpspec.h>
1728 +#include <asm/apic.h>
1729 +#include <mach_apic.h>
1730 +#endif
1731 +#include <asm/hypervisor.h>
1732 +
1733 +#include "cpu.h"
1734 +
1735 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
1736 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
1737 +
1738 +#ifndef CONFIG_XEN
1739 +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
1740 +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
1741 +#endif
1742 +
1743 +static int cachesize_override __devinitdata = -1;
1744 +static int disable_x86_fxsr __devinitdata = 0;
1745 +static int disable_x86_serial_nr __devinitdata = 1;
1746 +
1747 +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
1748 +
1749 +extern int disable_pse;
1750 +
1751 +static void default_init(struct cpuinfo_x86 * c)
1752 +{
1753 +       /* Not much we can do here... */
1754 +       /* Check if at least it has cpuid */
1755 +       if (c->cpuid_level == -1) {
1756 +               /* No cpuid. It must be an ancient CPU */
1757 +               if (c->x86 == 4)
1758 +                       strcpy(c->x86_model_id, "486");
1759 +               else if (c->x86 == 3)
1760 +                       strcpy(c->x86_model_id, "386");
1761 +       }
1762 +}
1763 +
1764 +static struct cpu_dev default_cpu = {
1765 +       .c_init = default_init,
1766 +       .c_vendor = "Unknown",
1767 +};
1768 +static struct cpu_dev * this_cpu = &default_cpu;
1769 +
1770 +static int __init cachesize_setup(char *str)
1771 +{
1772 +       get_option (&str, &cachesize_override);
1773 +       return 1;
1774 +}
1775 +__setup("cachesize=", cachesize_setup);
1776 +
1777 +int __devinit get_model_name(struct cpuinfo_x86 *c)
1778 +{
1779 +       unsigned int *v;
1780 +       char *p, *q;
1781 +
1782 +       if (cpuid_eax(0x80000000) < 0x80000004)
1783 +               return 0;
1784 +
1785 +       v = (unsigned int *) c->x86_model_id;
1786 +       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
1787 +       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
1788 +       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
1789 +       c->x86_model_id[48] = 0;
1790 +
1791 +       /* Intel chips right-justify this string for some dumb reason;
1792 +          undo that brain damage */
1793 +       p = q = &c->x86_model_id[0];
1794 +       while ( *p == ' ' )
1795 +            p++;
1796 +       if ( p != q ) {
1797 +            while ( *p )
1798 +                 *q++ = *p++;
1799 +            while ( q <= &c->x86_model_id[48] )
1800 +                 *q++ = '\0';  /* Zero-pad the rest */
1801 +       }
1802 +
1803 +       return 1;
1804 +}
1805 +
1806 +
1807 +void __devinit display_cacheinfo(struct cpuinfo_x86 *c)
1808 +{
1809 +       unsigned int n, dummy, ecx, edx, l2size;
1810 +
1811 +       n = cpuid_eax(0x80000000);
1812 +
1813 +       if (n >= 0x80000005) {
1814 +               cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
1815 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
1816 +                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
1817 +               c->x86_cache_size = (ecx >> 24) + (edx >> 24);
1818 +       }
1819 +
1820 +       if (n < 0x80000006)     /* Some chips just have a large L1. */
1821 +               return;
1822 +
1823 +       ecx = cpuid_ecx(0x80000006);
1824 +       l2size = ecx >> 16;
1825 +       
1826 +       /* do processor-specific cache resizing */
1827 +       if (this_cpu->c_size_cache)
1828 +               l2size = this_cpu->c_size_cache(c,l2size);
1829 +
1830 +       /* Allow user to override all this if necessary. */
1831 +       if (cachesize_override != -1)
1832 +               l2size = cachesize_override;
1833 +
1834 +       if ( l2size == 0 )
1835 +               return;         /* Again, no L2 cache is possible */
1836 +
1837 +       c->x86_cache_size = l2size;
1838 +
1839 +       printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
1840 +              l2size, ecx & 0xFF);
1841 +}
1842 +
1843 +/* Naming convention should be: <Name> [(<Codename>)] */
1844 +/* This table is only used if init_<vendor>() below doesn't set it; */
1845 +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
1846 +
1847 +/* Look up CPU names by table lookup. */
1848 +static char __devinit *table_lookup_model(struct cpuinfo_x86 *c)
1849 +{
1850 +       struct cpu_model_info *info;
1851 +
1852 +       if ( c->x86_model >= 16 )
1853 +               return NULL;    /* Range check */
1854 +
1855 +       if (!this_cpu)
1856 +               return NULL;
1857 +
1858 +       info = this_cpu->c_models;
1859 +
1860 +       while (info && info->family) {
1861 +               if (info->family == c->x86)
1862 +                       return info->model_names[c->x86_model];
1863 +               info++;
1864 +       }
1865 +       return NULL;            /* Not found */
1866 +}
1867 +
1868 +
1869 +static void __devinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
1870 +{
1871 +       char *v = c->x86_vendor_id;
1872 +       int i;
1873 +       static int printed;
1874 +
1875 +       for (i = 0; i < X86_VENDOR_NUM; i++) {
1876 +               if (cpu_devs[i]) {
1877 +                       if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
1878 +                           (cpu_devs[i]->c_ident[1] && 
1879 +                            !strcmp(v,cpu_devs[i]->c_ident[1]))) {
1880 +                               c->x86_vendor = i;
1881 +                               if (!early)
1882 +                                       this_cpu = cpu_devs[i];
1883 +                               return;
1884 +                       }
1885 +               }
1886 +       }
1887 +       if (!printed) {
1888 +               printed++;
1889 +               printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
1890 +               printk(KERN_ERR "CPU: Your system may be unstable.\n");
1891 +       }
1892 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
1893 +       this_cpu = &default_cpu;
1894 +}
1895 +
1896 +
1897 +static int __init x86_fxsr_setup(char * s)
1898 +{
1899 +       disable_x86_fxsr = 1;
1900 +       return 1;
1901 +}
1902 +__setup("nofxsr", x86_fxsr_setup);
1903 +
1904 +
1905 +/* Standard macro to see if a specific flag is changeable */
1906 +static inline int flag_is_changeable_p(u32 flag)
1907 +{
1908 +       u32 f1, f2;
1909 +
1910 +       asm("pushfl\n\t"
1911 +           "pushfl\n\t"
1912 +           "popl %0\n\t"
1913 +           "movl %0,%1\n\t"
1914 +           "xorl %2,%0\n\t"
1915 +           "pushl %0\n\t"
1916 +           "popfl\n\t"
1917 +           "pushfl\n\t"
1918 +           "popl %0\n\t"
1919 +           "popfl\n\t"
1920 +           : "=&r" (f1), "=&r" (f2)
1921 +           : "ir" (flag));
1922 +
1923 +       return ((f1^f2) & flag) != 0;
1924 +}
1925 +
1926 +
1927 +/* Probe for the CPUID instruction */
1928 +static int __devinit have_cpuid_p(void)
1929 +{
1930 +       return flag_is_changeable_p(X86_EFLAGS_ID);
1931 +}
1932 +
1933 +/* Do minimum CPU detection early.
1934 +   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
1935 +   The others are not touched to avoid unwanted side effects.
1936 +
1937 +   WARNING: this function is only called on the BP.  Don't add code here
1938 +   that is supposed to run on all CPUs. */
1939 +static void __init early_cpu_detect(void)
1940 +{
1941 +       struct cpuinfo_x86 *c = &boot_cpu_data;
1942 +
1943 +       c->x86_cache_alignment = 32;
1944 +
1945 +       if (!have_cpuid_p())
1946 +               return;
1947 +
1948 +       /* Get vendor name */
1949 +       cpuid(0x00000000, &c->cpuid_level,
1950 +             (int *)&c->x86_vendor_id[0],
1951 +             (int *)&c->x86_vendor_id[8],
1952 +             (int *)&c->x86_vendor_id[4]);
1953 +
1954 +       get_cpu_vendor(c, 1);
1955 +
1956 +       c->x86 = 4;
1957 +       if (c->cpuid_level >= 0x00000001) {
1958 +               u32 junk, tfms, cap0, misc;
1959 +               cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
1960 +               c->x86 = (tfms >> 8) & 15;
1961 +               c->x86_model = (tfms >> 4) & 15;
1962 +               if (c->x86 == 0xf)
1963 +                       c->x86 += (tfms >> 20) & 0xff;
1964 +               if (c->x86 >= 0x6)
1965 +                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
1966 +               c->x86_mask = tfms & 15;
1967 +               if (cap0 & (1<<19))
1968 +                       c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
1969 +       }
1970 +}
1971 +
1972 +void __devinit generic_identify(struct cpuinfo_x86 * c)
1973 +{
1974 +       u32 tfms, xlvl;
1975 +       int junk;
1976 +
1977 +       if (have_cpuid_p()) {
1978 +               /* Get vendor name */
1979 +               cpuid(0x00000000, &c->cpuid_level,
1980 +                     (int *)&c->x86_vendor_id[0],
1981 +                     (int *)&c->x86_vendor_id[8],
1982 +                     (int *)&c->x86_vendor_id[4]);
1983 +               
1984 +               get_cpu_vendor(c, 0);
1985 +               /* Initialize the standard set of capabilities */
1986 +               /* Note that the vendor-specific code below might override */
1987 +       
1988 +               /* Intel-defined flags: level 0x00000001 */
1989 +               if ( c->cpuid_level >= 0x00000001 ) {
1990 +                       u32 capability, excap;
1991 +                       cpuid(0x00000001, &tfms, &junk, &excap, &capability);
1992 +                       c->x86_capability[0] = capability;
1993 +                       c->x86_capability[4] = excap;
1994 +                       c->x86 = (tfms >> 8) & 15;
1995 +                       c->x86_model = (tfms >> 4) & 15;
1996 +                       if (c->x86 == 0xf)
1997 +                               c->x86 += (tfms >> 20) & 0xff;
1998 +                       if (c->x86 >= 0x6)
1999 +                               c->x86_model += ((tfms >> 16) & 0xF) << 4;
2000 +                       c->x86_mask = tfms & 15;
2001 +               } else {
2002 +                       /* Have CPUID level 0 only - unheard of */
2003 +                       c->x86 = 4;
2004 +               }
2005 +
2006 +               /* AMD-defined flags: level 0x80000001 */
2007 +               xlvl = cpuid_eax(0x80000000);
2008 +               if ( (xlvl & 0xffff0000) == 0x80000000 ) {
2009 +                       if ( xlvl >= 0x80000001 ) {
2010 +                               c->x86_capability[1] = cpuid_edx(0x80000001);
2011 +                               c->x86_capability[6] = cpuid_ecx(0x80000001);
2012 +                       }
2013 +                       if ( xlvl >= 0x80000004 )
2014 +                               get_model_name(c); /* Default name */
2015 +               }
2016 +       }
2017 +
2018 +       early_intel_workaround(c);
2019 +
2020 +#ifdef CONFIG_X86_HT
2021 +       phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
2022 +#endif
2023 +}
2024 +
2025 +static void __devinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
2026 +{
2027 +       if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
2028 +               /* Disable processor serial number */
2029 +               unsigned long lo,hi;
2030 +               rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2031 +               lo |= 0x200000;
2032 +               wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2033 +               printk(KERN_NOTICE "CPU serial number disabled.\n");
2034 +               clear_bit(X86_FEATURE_PN, c->x86_capability);
2035 +
2036 +               /* Disabling the serial number may affect the cpuid level */
2037 +               c->cpuid_level = cpuid_eax(0);
2038 +       }
2039 +}
2040 +
2041 +static int __init x86_serial_nr_setup(char *s)
2042 +{
2043 +       disable_x86_serial_nr = 0;
2044 +       return 1;
2045 +}
2046 +__setup("serialnumber", x86_serial_nr_setup);
2047 +
2048 +
2049 +
2050 +/*
2051 + * This does the hard work of actually picking apart the CPU stuff...
2052 + */
2053 +void __devinit identify_cpu(struct cpuinfo_x86 *c)
2054 +{
2055 +       int i;
2056 +
2057 +       c->loops_per_jiffy = loops_per_jiffy;
2058 +       c->x86_cache_size = -1;
2059 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
2060 +       c->cpuid_level = -1;    /* CPUID not detected */
2061 +       c->x86_model = c->x86_mask = 0; /* So far unknown... */
2062 +       c->x86_vendor_id[0] = '\0'; /* Unset */
2063 +       c->x86_model_id[0] = '\0';  /* Unset */
2064 +       c->x86_max_cores = 1;
2065 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
2066 +
2067 +       if (!have_cpuid_p()) {
2068 +               /* First of all, decide if this is a 486 or higher */
2069 +               /* It's a 486 if we can modify the AC flag */
2070 +               if ( flag_is_changeable_p(X86_EFLAGS_AC) )
2071 +                       c->x86 = 4;
2072 +               else
2073 +                       c->x86 = 3;
2074 +       }
2075 +
2076 +       generic_identify(c);
2077 +
2078 +       printk(KERN_DEBUG "CPU: After generic identify, caps:");
2079 +       for (i = 0; i < NCAPINTS; i++)
2080 +               printk(" %08lx", c->x86_capability[i]);
2081 +       printk("\n");
2082 +
2083 +       if (this_cpu->c_identify) {
2084 +               this_cpu->c_identify(c);
2085 +
2086 +               printk(KERN_DEBUG "CPU: After vendor identify, caps:");
2087 +               for (i = 0; i < NCAPINTS; i++)
2088 +                       printk(" %08lx", c->x86_capability[i]);
2089 +               printk("\n");
2090 +       }
2091 +
2092 +       /*
2093 +        * Vendor-specific initialization.  In this section we
2094 +        * canonicalize the feature flags, meaning if there are
2095 +        * features a certain CPU supports which CPUID doesn't
2096 +        * tell us, CPUID claiming incorrect flags, or other bugs,
2097 +        * we handle them here.
2098 +        *
2099 +        * At the end of this section, c->x86_capability better
2100 +        * indicate the features this CPU genuinely supports!
2101 +        */
2102 +       if (this_cpu->c_init)
2103 +               this_cpu->c_init(c);
2104 +
2105 +       /* Disable the PN if appropriate */
2106 +       squash_the_stupid_serial_number(c);
2107 +
2108 +       /*
2109 +        * The vendor-specific functions might have changed features.  Now
2110 +        * we do "generic changes."
2111 +        */
2112 +
2113 +       /* TSC disabled? */
2114 +       if ( tsc_disable )
2115 +               clear_bit(X86_FEATURE_TSC, c->x86_capability);
2116 +
2117 +       /* FXSR disabled? */
2118 +       if (disable_x86_fxsr) {
2119 +               clear_bit(X86_FEATURE_FXSR, c->x86_capability);
2120 +               clear_bit(X86_FEATURE_XMM, c->x86_capability);
2121 +       }
2122 +
2123 +       if (disable_pse)
2124 +               clear_bit(X86_FEATURE_PSE, c->x86_capability);
2125 +
2126 +       /* If the model name is still unset, do table lookup. */
2127 +       if ( !c->x86_model_id[0] ) {
2128 +               char *p;
2129 +               p = table_lookup_model(c);
2130 +               if ( p )
2131 +                       strcpy(c->x86_model_id, p);
2132 +               else
2133 +                       /* Last resort... */
2134 +                       sprintf(c->x86_model_id, "%02x/%02x",
2135 +                               c->x86_vendor, c->x86_model);
2136 +       }
2137 +
2138 +       /* Now the feature flags better reflect actual CPU features! */
2139 +
2140 +       printk(KERN_DEBUG "CPU: After all inits, caps:");
2141 +       for (i = 0; i < NCAPINTS; i++)
2142 +               printk(" %08lx", c->x86_capability[i]);
2143 +       printk("\n");
2144 +
2145 +       /*
2146 +        * On SMP, boot_cpu_data holds the common feature set between
2147 +        * all CPUs; so make sure that we indicate which features are
2148 +        * common between the CPUs.  The first time this routine gets
2149 +        * executed, c == &boot_cpu_data.
2150 +        */
2151 +       if ( c != &boot_cpu_data ) {
2152 +               /* AND the already accumulated flags with these */
2153 +               for ( i = 0 ; i < NCAPINTS ; i++ )
2154 +                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
2155 +       }
2156 +
2157 +       /* Init Machine Check Exception if available. */
2158 +       mcheck_init(c);
2159 +
2160 +       if (c == &boot_cpu_data)
2161 +               sysenter_setup();
2162 +       enable_sep_cpu();
2163 +
2164 +       if (c == &boot_cpu_data)
2165 +               mtrr_bp_init();
2166 +       else
2167 +               mtrr_ap_init();
2168 +}
2169 +
2170 +#ifdef CONFIG_X86_HT
2171 +void __devinit detect_ht(struct cpuinfo_x86 *c)
2172 +{
2173 +       u32     eax, ebx, ecx, edx;
2174 +       int     index_msb, core_bits;
2175 +       int     cpu = smp_processor_id();
2176 +
2177 +       cpuid(1, &eax, &ebx, &ecx, &edx);
2178 +
2179 +       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
2180 +
2181 +       if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
2182 +               return;
2183 +
2184 +       smp_num_siblings = (ebx & 0xff0000) >> 16;
2185 +
2186 +       if (smp_num_siblings == 1) {
2187 +               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
2188 +       } else if (smp_num_siblings > 1 ) {
2189 +
2190 +               if (smp_num_siblings > NR_CPUS) {
2191 +                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d\n", smp_num_siblings);
2192 +                       smp_num_siblings = 1;
2193 +                       return;
2194 +               }
2195 +
2196 +               index_msb = get_count_order(smp_num_siblings);
2197 +               phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
2198 +
2199 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
2200 +                      phys_proc_id[cpu]);
2201 +
2202 +               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
2203 +
2204 +               index_msb = get_count_order(smp_num_siblings);
2205 +
2206 +               core_bits = get_count_order(c->x86_max_cores);
2207 +
2208 +               cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
2209 +                                              ((1 << core_bits) - 1);
2210 +
2211 +               if (c->x86_max_cores > 1)
2212 +                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
2213 +                              cpu_core_id[cpu]);
2214 +       }
2215 +}
2216 +#endif
2217 +
2218 +void __devinit print_cpu_info(struct cpuinfo_x86 *c)
2219 +{
2220 +       char *vendor = NULL;
2221 +
2222 +       if (c->x86_vendor < X86_VENDOR_NUM)
2223 +               vendor = this_cpu->c_vendor;
2224 +       else if (c->cpuid_level >= 0)
2225 +               vendor = c->x86_vendor_id;
2226 +
2227 +       if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
2228 +               printk("%s ", vendor);
2229 +
2230 +       if (!c->x86_model_id[0])
2231 +               printk("%d86", c->x86);
2232 +       else
2233 +               printk("%s", c->x86_model_id);
2234 +
2235 +       if (c->x86_mask || c->cpuid_level >= 0) 
2236 +               printk(" stepping %02x\n", c->x86_mask);
2237 +       else
2238 +               printk("\n");
2239 +}
2240 +
2241 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
2242 +
2243 +/* This is hacky. :)
2244 + * We're emulating future behavior.
2245 + * In the future, the cpu-specific init functions will be called implicitly
2246 + * via the magic of initcalls.
2247 + * They will insert themselves into the cpu_devs structure.
2248 + * Then, when cpu_init() is called, we can just iterate over that array.
2249 + */
2250 +
2251 +extern int intel_cpu_init(void);
2252 +extern int cyrix_init_cpu(void);
2253 +extern int nsc_init_cpu(void);
2254 +extern int amd_init_cpu(void);
2255 +extern int centaur_init_cpu(void);
2256 +extern int transmeta_init_cpu(void);
2257 +extern int rise_init_cpu(void);
2258 +extern int nexgen_init_cpu(void);
2259 +extern int umc_init_cpu(void);
2260 +
2261 +void __init early_cpu_init(void)
2262 +{
2263 +       intel_cpu_init();
2264 +       cyrix_init_cpu();
2265 +       nsc_init_cpu();
2266 +       amd_init_cpu();
2267 +       centaur_init_cpu();
2268 +       transmeta_init_cpu();
2269 +       rise_init_cpu();
2270 +       nexgen_init_cpu();
2271 +       umc_init_cpu();
2272 +       early_cpu_detect();
2273 +
2274 +#ifdef CONFIG_DEBUG_PAGEALLOC
2275 +       /* pse is not compatible with on-the-fly unmapping,
2276 +        * disable it even if the cpus claim to support it.
2277 +        */
2278 +       clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
2279 +       disable_pse = 1;
2280 +#endif
2281 +}
2282 +
2283 +void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
2284 +{
2285 +       unsigned long frames[16];
2286 +       unsigned long va;
2287 +       int f;
2288 +
2289 +       for (va = gdt_descr->address, f = 0;
2290 +            va < gdt_descr->address + gdt_descr->size;
2291 +            va += PAGE_SIZE, f++) {
2292 +               frames[f] = virt_to_mfn(va);
2293 +               make_lowmem_page_readonly(
2294 +                       (void *)va, XENFEAT_writable_descriptor_tables);
2295 +       }
2296 +       if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
2297 +               BUG();
2298 +}
2299 +
2300 +/*
2301 + * cpu_init() initializes state that is per-CPU. Some data is already
2302 + * initialized (naturally) in the bootstrap process, such as the GDT
2303 + * and IDT. We reload them nevertheless, this function acts as a
2304 + * 'CPU state barrier', nothing should get across.
2305 + */
2306 +void __cpuinit cpu_init(void)
2307 +{
2308 +       int cpu = smp_processor_id();
2309 +#ifndef CONFIG_X86_NO_TSS
2310 +       struct tss_struct * t = &per_cpu(init_tss, cpu);
2311 +#endif
2312 +       struct thread_struct *thread = &current->thread;
2313 +       struct desc_struct *gdt;
2314 +       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2315 +
2316 +       if (cpu_test_and_set(cpu, cpu_initialized)) {
2317 +               printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
2318 +               for (;;) local_irq_enable();
2319 +       }
2320 +       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
2321 +
2322 +       if (cpu_has_vme || cpu_has_de)
2323 +               clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
2324 +       if (tsc_disable && cpu_has_tsc) {
2325 +               printk(KERN_NOTICE "Disabling TSC...\n");
2326 +               /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
2327 +               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
2328 +               set_in_cr4(X86_CR4_TSD);
2329 +       }
2330 +
2331 +#ifndef CONFIG_XEN
2332 +       /*
2333 +        * This is a horrible hack to allocate the GDT.  The problem
2334 +        * is that cpu_init() is called really early for the boot CPU
2335 +        * (and hence needs bootmem) but much later for the secondary
2336 +        * CPUs, when bootmem will have gone away
2337 +        */
2338 +       if (NODE_DATA(0)->bdata->node_bootmem_map) {
2339 +               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2340 +               /* alloc_bootmem_pages panics on failure, so no check */
2341 +               memset(gdt, 0, PAGE_SIZE);
2342 +       } else {
2343 +               gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
2344 +               if (unlikely(!gdt)) {
2345 +                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
2346 +                       for (;;)
2347 +                               local_irq_enable();
2348 +               }
2349 +       }
2350 +
2351 +       /*
2352 +        * Initialize the per-CPU GDT with the boot GDT,
2353 +        * and set up the GDT descriptor:
2354 +        */
2355 +       memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2356 +
2357 +       /* Set up GDT entry for 16bit stack */
2358 +       *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
2359 +               ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
2360 +               ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
2361 +               (CPU_16BIT_STACK_SIZE - 1);
2362 +
2363 +       cpu_gdt_descr->size = GDT_SIZE - 1;
2364 +       cpu_gdt_descr->address = (unsigned long)gdt;
2365 +#else
2366 +       if (cpu == 0 && cpu_gdt_descr->address == 0) {
2367 +               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2368 +               /* alloc_bootmem_pages panics on failure, so no check */
2369 +               memset(gdt, 0, PAGE_SIZE);
2370 +
2371 +               memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2372 +               
2373 +               cpu_gdt_descr->size = GDT_SIZE;
2374 +               cpu_gdt_descr->address = (unsigned long)gdt;
2375 +       }
2376 +#endif
2377 +
2378 +       cpu_gdt_init(cpu_gdt_descr);
2379 +
2380 +       /*
2381 +        * Set up and load the per-CPU TSS and LDT
2382 +        */
2383 +       atomic_inc(&init_mm.mm_count);
2384 +       current->active_mm = &init_mm;
2385 +       if (current->mm)
2386 +               BUG();
2387 +       enter_lazy_tlb(&init_mm, current);
2388 +
2389 +       load_esp0(t, thread);
2390 +
2391 +       load_LDT(&init_mm.context);
2392 +
2393 +#ifdef CONFIG_DOUBLEFAULT
2394 +       /* Set up doublefault TSS pointer in the GDT */
2395 +       __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
2396 +#endif
2397 +
2398 +       /* Clear %fs and %gs. */
2399 +       asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
2400 +
2401 +       /* Clear all 6 debug registers: */
2402 +       set_debugreg(0, 0);
2403 +       set_debugreg(0, 1);
2404 +       set_debugreg(0, 2);
2405 +       set_debugreg(0, 3);
2406 +       set_debugreg(0, 6);
2407 +       set_debugreg(0, 7);
2408 +
2409 +       /*
2410 +        * Force FPU initialization:
2411 +        */
2412 +       current_thread_info()->status = 0;
2413 +       clear_used_math();
2414 +       mxcsr_feature_mask_init();
2415 +}
2416 +
2417 +#ifdef CONFIG_HOTPLUG_CPU
2418 +void __devinit cpu_uninit(void)
2419 +{
2420 +       int cpu = raw_smp_processor_id();
2421 +       cpu_clear(cpu, cpu_initialized);
2422 +
2423 +       /* lazy TLB state */
2424 +       per_cpu(cpu_tlbstate, cpu).state = 0;
2425 +       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
2426 +}
2427 +#endif
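
Both early_cpu_detect() and generic_identify() above decode the
family/model/stepping fields from the EAX word ("tfms") returned by
CPUID leaf 1. A standalone sketch of that bit arithmetic with one
worked value (decode_tfms is a made-up name):

#include <stdio.h>

/* Family in bits 11:8 (extended family added when family == 0xf),
 * model in bits 7:4 (extended model bits 19:16 prepended for
 * family >= 6), stepping in bits 3:0, as in the hunk above. */
static void decode_tfms(unsigned int tfms)
{
	unsigned int family = (tfms >> 8) & 15;
	unsigned int model  = (tfms >> 4) & 15;
	unsigned int step   = tfms & 15;

	if (family == 0xf)
		family += (tfms >> 20) & 0xff;
	if (family >= 0x6)
		model += ((tfms >> 16) & 0xf) << 4;

	printf("family %u, model %u, stepping %u\n", family, model, step);
}

int main(void)
{
	decode_tfms(0x06fb);	/* prints: family 6, model 15, stepping 11 */
	return 0;
}
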
2428 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/cpu/Makefile tmp-linux-2.6-xen.patch/arch/i386/kernel/cpu/Makefile
2429 --- ref-linux-2.6.16.9/arch/i386/kernel/cpu/Makefile    2006-04-19 08:10:14.000000000 +0200
2430 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/cpu/Makefile       2006-04-10 00:05:52.000000000 +0200
2431 @@ -17,3 +17,8 @@ obj-$(CONFIG_X86_MCE) +=      mcheck/
2432  
2433  obj-$(CONFIG_MTRR)     +=      mtrr/
2434  obj-$(CONFIG_CPU_FREQ) +=      cpufreq/
2435 +
2436 +ifdef CONFIG_XEN
2437 +include $(srctree)/scripts/Makefile.xen
2438 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
2439 +endif
2440 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/cpu/mtrr/main-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/cpu/mtrr/main-xen.c
2441 --- ref-linux-2.6.16.9/arch/i386/kernel/cpu/mtrr/main-xen.c     1970-01-01 01:00:00.000000000 +0100
2442 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/cpu/mtrr/main-xen.c        2006-04-10 00:05:52.000000000 +0200
2443 @@ -0,0 +1,196 @@
2444 +#include <linux/init.h>
2445 +#include <linux/proc_fs.h>
2446 +#include <linux/ctype.h>
2447 +#include <linux/module.h>
2448 +#include <linux/seq_file.h>
2449 +#include <asm/uaccess.h>
2450 +
2451 +#include <asm/mtrr.h>
2452 +#include "mtrr.h"
2453 +
2454 +static DECLARE_MUTEX(mtrr_sem);
2455 +
2456 +void generic_get_mtrr(unsigned int reg, unsigned long *base,
2457 +                     unsigned int *size, mtrr_type * type)
2458 +{
2459 +       dom0_op_t op;
2460 +
2461 +       op.cmd = DOM0_READ_MEMTYPE;
2462 +       op.u.read_memtype.reg = reg;
2463 +       (void)HYPERVISOR_dom0_op(&op);
2464 +
2465 +       *size = op.u.read_memtype.nr_mfns;
2466 +       *base = op.u.read_memtype.mfn;
2467 +       *type = op.u.read_memtype.type;
2468 +}
2469 +
2470 +struct mtrr_ops generic_mtrr_ops = {
2471 +       .use_intel_if      = 1,
2472 +       .get               = generic_get_mtrr,
2473 +};
2474 +
2475 +struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
2476 +unsigned int num_var_ranges;
2477 +unsigned int *usage_table;
2478 +
2479 +static void __init set_num_var_ranges(void)
2480 +{
2481 +       dom0_op_t op;
2482 +
2483 +       for (num_var_ranges = 0; ; num_var_ranges++) {
2484 +               op.cmd = DOM0_READ_MEMTYPE;
2485 +               op.u.read_memtype.reg = num_var_ranges;
2486 +               if (HYPERVISOR_dom0_op(&op) != 0)
2487 +                       break;
2488 +       }
2489 +}
2490 +
2491 +static void __init init_table(void)
2492 +{
2493 +       int i, max;
2494 +
2495 +       max = num_var_ranges;
2496 +       if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
2497 +           == NULL) {
2498 +               printk(KERN_ERR "mtrr: could not allocate\n");
2499 +               return;
2500 +       }
2501 +       for (i = 0; i < max; i++)
2502 +               usage_table[i] = 0;
2503 +}
2504 +
2505 +int mtrr_add_page(unsigned long base, unsigned long size, 
2506 +                 unsigned int type, char increment)
2507 +{
2508 +       int error;
2509 +       dom0_op_t op;
2510 +
2511 +       down(&mtrr_sem);
2512 +
2513 +       op.cmd = DOM0_ADD_MEMTYPE;
2514 +       op.u.add_memtype.mfn     = base;
2515 +       op.u.add_memtype.nr_mfns = size;
2516 +       op.u.add_memtype.type    = type;
2517 +       error = HYPERVISOR_dom0_op(&op);
2518 +       if (error) {
2519 +               up(&mtrr_sem);
2520 +               BUG_ON(error > 0);
2521 +               return error;
2522 +       }
2523 +
2524 +       if (increment)
2525 +               ++usage_table[op.u.add_memtype.reg];
2526 +
2527 +       up(&mtrr_sem);
2528 +
2529 +       return op.u.add_memtype.reg;
2530 +}
2531 +
2532 +static int mtrr_check(unsigned long base, unsigned long size)
2533 +{
2534 +       if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
2535 +               printk(KERN_WARNING
2536 +                       "mtrr: size and base must be multiples of 4 kiB\n");
2537 +               printk(KERN_DEBUG
2538 +                       "mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
2539 +               dump_stack();
2540 +               return -1;
2541 +       }
2542 +       return 0;
2543 +}
2544 +
2545 +int
2546 +mtrr_add(unsigned long base, unsigned long size, unsigned int type,
2547 +        char increment)
2548 +{
2549 +       if (mtrr_check(base, size))
2550 +               return -EINVAL;
2551 +       return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
2552 +                            increment);
2553 +}
2554 +
2555 +int mtrr_del_page(int reg, unsigned long base, unsigned long size)
2556 +{
2557 +       unsigned i;
2558 +       mtrr_type ltype;
2559 +       unsigned long lbase;
2560 +       unsigned int lsize;
2561 +       int error = -EINVAL;
2562 +       dom0_op_t op;
2563 +
2564 +       down(&mtrr_sem);
2565 +
2566 +       if (reg < 0) {
2567 +               /*  Search for existing MTRR  */
2568 +               for (i = 0; i < num_var_ranges; ++i) {
2569 +                       mtrr_if->get(i, &lbase, &lsize, &ltype);
2570 +                       if (lbase == base && lsize == size) {
2571 +                               reg = i;
2572 +                               break;
2573 +                       }
2574 +               }
2575 +               if (reg < 0) {
2576 +                       printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
2577 +                              size);
2578 +                       goto out;
2579 +               }
2580 +       }
2581 +       if (usage_table[reg] < 1) {
2582 +               printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
2583 +               goto out;
2584 +       }
2585 +       if (--usage_table[reg] < 1) {
2586 +               op.cmd = DOM0_DEL_MEMTYPE;
2587 +               op.u.del_memtype.handle = 0;
2588 +               op.u.del_memtype.reg    = reg;
2589 +               error = HYPERVISOR_dom0_op(&op);
2590 +               if (error) {
2591 +                       BUG_ON(error > 0);
2592 +                       goto out;
2593 +               }
2594 +       }
2595 +       error = reg;
2596 + out:
2597 +       up(&mtrr_sem);
2598 +       return error;
2599 +}
2600 +
2601 +int
2602 +mtrr_del(int reg, unsigned long base, unsigned long size)
2603 +{
2604 +       if (mtrr_check(base, size))
2605 +               return -EINVAL;
2606 +       return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
2607 +}
2608 +
2609 +EXPORT_SYMBOL(mtrr_add);
2610 +EXPORT_SYMBOL(mtrr_del);
2611 +
2612 +void __init mtrr_bp_init(void)
2613 +{
2614 +}
2615 +
2616 +void mtrr_ap_init(void)
2617 +{
2618 +}
2619 +
2620 +static int __init mtrr_init(void)
2621 +{
2622 +       struct cpuinfo_x86 *c = &boot_cpu_data;
2623 +
2624 +       if (!(xen_start_info->flags & SIF_PRIVILEGED))
2625 +               return -ENODEV;
2626 +
2627 +       if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
2628 +           (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
2629 +           (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
2630 +           (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
2631 +               return -ENODEV;
2632 +
2633 +       set_num_var_ranges();
2634 +       init_table();
2635 +
2636 +       return 0;
2637 +}
2638 +
2639 +subsys_initcall(mtrr_init);
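
Under Xen the mtrr_add()/mtrr_del() entry points above keep the usual
kernel API but forward the work to DOM0_ADD_MEMTYPE/DOM0_DEL_MEMTYPE
hypercalls, and mtrr_init() bails out unless the domain holds
SIF_PRIVILEGED. A hedged usage sketch; the address, size and function
name below are made up:

#include <asm/mtrr.h>

/* Hypothetical driver fragment: mark a 16 MiB frame buffer
 * write-combining.  mtrr_add() checks 4 KiB alignment, converts the
 * range to page numbers, and issues the dom0 hypercall; the returned
 * register index is handed back to mtrr_del() on teardown. */
static int example_map_framebuffer(void)
{
	int reg = mtrr_add(0xd0000000UL, 16 * 1024 * 1024,
			   MTRR_TYPE_WRCOMB, 1);
	if (reg < 0)
		return reg;		/* e.g. -EINVAL on misalignment */

	/* ... use the frame buffer ... */

	mtrr_del(reg, 0xd0000000UL, 16 * 1024 * 1024);
	return 0;
}
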
2640 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/cpu/mtrr/Makefile tmp-linux-2.6-xen.patch/arch/i386/kernel/cpu/mtrr/Makefile
2641 --- ref-linux-2.6.16.9/arch/i386/kernel/cpu/mtrr/Makefile       2006-04-19 08:10:14.000000000 +0200
2642 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/cpu/mtrr/Makefile  2006-04-10 00:05:52.000000000 +0200
2643 @@ -3,3 +3,10 @@ obj-y          += amd.o
2644  obj-y          += cyrix.o
2645  obj-y          += centaur.o
2646  
2647 +ifdef CONFIG_XEN
2648 +include $(srctree)/scripts/Makefile.xen
2649 +n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o
2650 +
2651 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
2652 +obj-y := $(call cherrypickxen, $(obj-y))
2653 +endif
2654 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/early_printk-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/early_printk-xen.c
2655 --- ref-linux-2.6.16.9/arch/i386/kernel/early_printk-xen.c      1970-01-01 01:00:00.000000000 +0100
2656 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/early_printk-xen.c 2006-04-10 00:05:52.000000000 +0200
2657 @@ -0,0 +1,2 @@
2658 +
2659 +#include "../../x86_64/kernel/early_printk-xen.c"
2660 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/entry-xen.S tmp-linux-2.6-xen.patch/arch/i386/kernel/entry-xen.S
2661 --- ref-linux-2.6.16.9/arch/i386/kernel/entry-xen.S     1970-01-01 01:00:00.000000000 +0100
2662 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/entry-xen.S        2006-04-10 00:05:52.000000000 +0200
2663 @@ -0,0 +1,876 @@
2664 +/*
2665 + *  linux/arch/i386/entry.S
2666 + *
2667 + *  Copyright (C) 1991, 1992  Linus Torvalds
2668 + */
2669 +
2670 +/*
2671 + * entry.S contains the system-call and fault low-level handling routines.
2672 + * This also contains the timer-interrupt handler, as well as all interrupts
2673 + * and faults that can result in a task-switch.
2674 + *
2675 + * NOTE: This code handles signal-recognition, which happens every time
2676 + * after a timer-interrupt and after each system call.
2677 + *
2678 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
2679 + * on a 486.
2680 + *
2681 + * Stack layout in 'ret_from_system_call':
2682 + *     ptrace needs to have all regs on the stack.
2683 + *     if the order here is changed, it needs to be
2684 + *     updated in fork.c:copy_process, signal.c:do_signal,
2685 + *     ptrace.c and ptrace.h
2686 + *
2687 + *      0(%esp) - %ebx
2688 + *      4(%esp) - %ecx
2689 + *      8(%esp) - %edx
2690 + *       C(%esp) - %esi
2691 + *     10(%esp) - %edi
2692 + *     14(%esp) - %ebp
2693 + *     18(%esp) - %eax
2694 + *     1C(%esp) - %ds
2695 + *     20(%esp) - %es
2696 + *     24(%esp) - orig_eax
2697 + *     28(%esp) - %eip
2698 + *     2C(%esp) - %cs
2699 + *     30(%esp) - %eflags
2700 + *     34(%esp) - %oldesp
2701 + *     38(%esp) - %oldss
2702 + *
2703 + * "current" is in register %ebx during any slow entries.
2704 + */
2705 +
2706 +#include <linux/config.h>
2707 +#include <linux/linkage.h>
2708 +#include <asm/thread_info.h>
2709 +#include <asm/errno.h>
2710 +#include <asm/segment.h>
2711 +#include <asm/smp.h>
2712 +#include <asm/page.h>
2713 +#include <asm/desc.h>
2714 +#include "irq_vectors.h"
2715 +#include <xen/interface/xen.h>
2716 +
2717 +#define nr_syscalls ((syscall_table_size)/4)
2718 +
2719 +EBX            = 0x00
2720 +ECX            = 0x04
2721 +EDX            = 0x08
2722 +ESI            = 0x0C
2723 +EDI            = 0x10
2724 +EBP            = 0x14
2725 +EAX            = 0x18
2726 +DS             = 0x1C
2727 +ES             = 0x20
2728 +ORIG_EAX       = 0x24
2729 +EIP            = 0x28
2730 +CS             = 0x2C
2731 +EFLAGS         = 0x30
2732 +OLDESP         = 0x34
2733 +OLDSS          = 0x38
2734 +
2735 +CF_MASK                = 0x00000001
2736 +TF_MASK                = 0x00000100
2737 +IF_MASK                = 0x00000200
2738 +DF_MASK                = 0x00000400 
2739 +NT_MASK                = 0x00004000
2740 +VM_MASK                = 0x00020000
2741 +/* Pseudo-eflags. */
2742 +NMI_MASK       = 0x80000000
2743 +
2744 +#ifndef CONFIG_XEN
2745 +#define DISABLE_INTERRUPTS     cli
2746 +#define ENABLE_INTERRUPTS      sti
2747 +#else
2748 +/* Offsets into shared_info_t. */
2749 +#define evtchn_upcall_pending          /* 0 */
2750 +#define evtchn_upcall_mask             1
2751 +
2752 +#define sizeof_vcpu_shift              6
2753 +
2754 +#ifdef CONFIG_SMP
2755 +#define GET_VCPU_INFO          movl TI_cpu(%ebp),%esi                  ; \
2756 +                               shl  $sizeof_vcpu_shift,%esi            ; \
2757 +                               addl HYPERVISOR_shared_info,%esi
2758 +#else
2759 +#define GET_VCPU_INFO          movl HYPERVISOR_shared_info,%esi
2760 +#endif
2761 +
2762 +#define __DISABLE_INTERRUPTS   movb $1,evtchn_upcall_mask(%esi)
2763 +#define __ENABLE_INTERRUPTS    movb $0,evtchn_upcall_mask(%esi)
2764 +#define DISABLE_INTERRUPTS     GET_VCPU_INFO                           ; \
2765 +                               __DISABLE_INTERRUPTS
2766 +#define ENABLE_INTERRUPTS      GET_VCPU_INFO                           ; \
2767 +                               __ENABLE_INTERRUPTS
2768 +#define __TEST_PENDING         testb $0xFF,evtchn_upcall_pending(%esi)
2769 +#endif
2770 +
2771 +#ifdef CONFIG_PREEMPT
2772 +#define preempt_stop           cli
2773 +#else
2774 +#define preempt_stop
2775 +#define resume_kernel          restore_nocheck
2776 +#endif
2777 +
2778 +#define SAVE_ALL \
2779 +       cld; \
2780 +       pushl %es; \
2781 +       pushl %ds; \
2782 +       pushl %eax; \
2783 +       pushl %ebp; \
2784 +       pushl %edi; \
2785 +       pushl %esi; \
2786 +       pushl %edx; \
2787 +       pushl %ecx; \
2788 +       pushl %ebx; \
2789 +       movl $(__USER_DS), %edx; \
2790 +       movl %edx, %ds; \
2791 +       movl %edx, %es;
2792 +
2793 +#define RESTORE_INT_REGS \
2794 +       popl %ebx;      \
2795 +       popl %ecx;      \
2796 +       popl %edx;      \
2797 +       popl %esi;      \
2798 +       popl %edi;      \
2799 +       popl %ebp;      \
2800 +       popl %eax
2801 +
2802 +#define RESTORE_REGS   \
2803 +       RESTORE_INT_REGS; \
2804 +1:     popl %ds;       \
2805 +2:     popl %es;       \
2806 +.section .fixup,"ax";  \
2807 +3:     movl $0,(%esp); \
2808 +       jmp 1b;         \
2809 +4:     movl $0,(%esp); \
2810 +       jmp 2b;         \
2811 +.previous;             \
2812 +.section __ex_table,"a";\
2813 +       .align 4;       \
2814 +       .long 1b,3b;    \
2815 +       .long 2b,4b;    \
2816 +.previous
2817 +
2818 +
2819 +ENTRY(ret_from_fork)
2820 +       pushl %eax
2821 +       call schedule_tail
2822 +       GET_THREAD_INFO(%ebp)
2823 +       popl %eax
2824 +       jmp syscall_exit
2825 +
2826 +/*
2827 + * Return to user mode is not as complex as all this looks,
2828 + * but we want the default path for a system call return to
2829 + * go as quickly as possible, which is why some of this is
2830 + * less clear than it otherwise should be.
2831 + */
2832 +
2833 +       # userspace resumption stub bypassing syscall exit tracing
2834 +       ALIGN
2835 +ret_from_exception:
2836 +       preempt_stop
2837 +ret_from_intr:
2838 +       GET_THREAD_INFO(%ebp)
2839 +       movl EFLAGS(%esp), %eax         # mix EFLAGS and CS
2840 +       movb CS(%esp), %al
2841 +       testl $(VM_MASK | 2), %eax
2842 +       jz resume_kernel
2843 +ENTRY(resume_userspace)
2844 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
2845 +                                       # setting need_resched or sigpending
2846 +                                       # between sampling and the iret
2847 +       movl TI_flags(%ebp), %ecx
2848 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
2849 +                                       # int/exception return?
2850 +       jne work_pending
2851 +       jmp restore_all
2852 +
2853 +#ifdef CONFIG_PREEMPT
2854 +ENTRY(resume_kernel)
2855 +       cli
2856 +       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
2857 +       jnz restore_nocheck
2858 +need_resched:
2859 +       movl TI_flags(%ebp), %ecx       # need_resched set ?
2860 +       testb $_TIF_NEED_RESCHED, %cl
2861 +       jz restore_all
2862 +       testl $IF_MASK,EFLAGS(%esp)     # interrupts off (exception path) ?
2863 +       jz restore_all
2864 +       call preempt_schedule_irq
2865 +       jmp need_resched
2866 +#endif
2867 +
2868 +#ifdef CONFIG_X86_SYSENTER
2869 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
2870 +   the vsyscall page.  See vsyscall-sysenter.S, which defines the symbol.  */
2871 +
2872 +       # sysenter call handler stub
2873 +ENTRY(sysenter_entry)
2874 +       movl TSS_sysenter_esp0(%esp),%esp
2875 +sysenter_past_esp:
2876 +       sti
2877 +       pushl $(__USER_DS)
2878 +       pushl %ebp
2879 +       pushfl
2880 +       pushl $(__USER_CS)
2881 +       pushl $SYSENTER_RETURN
2882 +
2883 +/*
2884 + * Load the potential sixth argument from user stack.
2885 + * Careful about security.
2886 + */
2887 +       cmpl $__PAGE_OFFSET-3,%ebp
2888 +       jae syscall_fault
2889 +1:     movl (%ebp),%ebp
2890 +.section __ex_table,"a"
2891 +       .align 4
2892 +       .long 1b,syscall_fault
2893 +.previous
2894 +
2895 +       pushl %eax
2896 +       SAVE_ALL
2897 +       GET_THREAD_INFO(%ebp)
2898 +
2899 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2900 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2901 +       jnz syscall_trace_entry
2902 +       cmpl $(nr_syscalls), %eax
2903 +       jae syscall_badsys
2904 +       call *sys_call_table(,%eax,4)
2905 +       movl %eax,EAX(%esp)
2906 +       cli
2907 +       movl TI_flags(%ebp), %ecx
2908 +       testw $_TIF_ALLWORK_MASK, %cx
2909 +       jne syscall_exit_work
2910 +/* if something modifies registers it must also disable sysexit */
2911 +       movl EIP(%esp), %edx
2912 +       movl OLDESP(%esp), %ecx
2913 +       xorl %ebp,%ebp
2914 +       sti
2915 +       sysexit
2916 +#endif /* CONFIG_X86_SYSENTER */
2917 +
2918 +
2919 +       # system call handler stub
2920 +ENTRY(system_call)
2921 +       pushl %eax                      # save orig_eax
2922 +       SAVE_ALL
2923 +       GET_THREAD_INFO(%ebp)
2924 +                                       # system call tracing in operation / emulation
2925 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2926 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2927 +       jnz syscall_trace_entry
2928 +       cmpl $(nr_syscalls), %eax
2929 +       jae syscall_badsys
2930 +syscall_call:
2931 +       call *sys_call_table(,%eax,4)
2932 +       movl %eax,EAX(%esp)             # store the return value
2933 +syscall_exit:
2934 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
2935 +                                       # setting need_resched or sigpending
2936 +                                       # between sampling and the iret
2937 +       movl TI_flags(%ebp), %ecx
2938 +       testw $_TIF_ALLWORK_MASK, %cx   # current->work
2939 +       jne syscall_exit_work
2940 +
2941 +restore_all:
2942 +#ifndef CONFIG_XEN
2943 +       movl EFLAGS(%esp), %eax         # mix EFLAGS, SS and CS
2944 +       # Warning: OLDSS(%esp) contains the wrong/random values if we
2945 +       # are returning to the kernel.
2946 +       # See comments in process.c:copy_thread() for details.
2947 +       movb OLDSS(%esp), %ah
2948 +       movb CS(%esp), %al
2949 +       andl $(VM_MASK | (4 << 8) | 3), %eax
2950 +       cmpl $((4 << 8) | 3), %eax
2951 +       je ldt_ss                       # returning to user-space with LDT SS
2952 +restore_nocheck:
2953 +#else
2954 +restore_nocheck:
2955 +       movl EFLAGS(%esp), %eax
2956 +       testl $(VM_MASK|NMI_MASK), %eax
2957 +       jnz hypervisor_iret
2958 +       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
2959 +       GET_VCPU_INFO
2960 +       andb evtchn_upcall_mask(%esi),%al
2961 +       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
2962 +       jnz restore_all_enable_events   #        != 0 => enable event delivery
2963 +#endif
2964 +       RESTORE_REGS
2965 +       addl $4, %esp
2966 +1:     iret
2967 +.section .fixup,"ax"
2968 +iret_exc:
2969 +#ifndef CONFIG_XEN
2970 +       sti
2971 +#endif
2972 +       pushl $0                        # no error code
2973 +       pushl $do_iret_error
2974 +       jmp error_code
2975 +.previous
2976 +.section __ex_table,"a"
2977 +       .align 4
2978 +       .long 1b,iret_exc
2979 +.previous
2980 +
2981 +#ifndef CONFIG_XEN
2982 +ldt_ss:
2983 +       larl OLDSS(%esp), %eax
2984 +       jnz restore_nocheck
2985 +       testl $0x00400000, %eax         # returning to 32bit stack?
2986 +       jnz restore_nocheck             # all right, normal return
2987 +       /* If returning to userspace with 16bit stack,
2988 +        * try to fix the higher word of ESP, as the CPU
2989 +        * won't restore it.
2990 +        * This is an "official" bug of all the x86-compatible
2991 +        * CPUs, which we can try to work around to make
2992 +        * dosemu and wine happy. */
2993 +       subl $8, %esp           # reserve space for switch16 pointer
2994 +       cli
2995 +       movl %esp, %eax
2996 +       /* Set up the 16bit stack frame with switch32 pointer on top,
2997 +        * and a switch16 pointer on top of the current frame. */
2998 +       call setup_x86_bogus_stack
2999 +       RESTORE_REGS
3000 +       lss 20+4(%esp), %esp    # switch to 16bit stack
3001 +1:     iret
3002 +.section __ex_table,"a"
3003 +       .align 4
3004 +       .long 1b,iret_exc
3005 +.previous
3006 +#else
3007 +hypervisor_iret:
3008 +       andl $~NMI_MASK, EFLAGS(%esp)
3009 +       RESTORE_REGS
3010 +       addl $4, %esp
3011 +       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
3012 +#endif
3013 +
3014 +       # perform work that needs to be done immediately before resumption
3015 +       ALIGN
3016 +work_pending:
3017 +       testb $_TIF_NEED_RESCHED, %cl
3018 +       jz work_notifysig
3019 +work_resched:
3020 +       call schedule
3021 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
3022 +                                       # setting need_resched or sigpending
3023 +                                       # between sampling and the iret
3024 +       movl TI_flags(%ebp), %ecx
3025 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
3026 +                                       # than syscall tracing?
3027 +       jz restore_all
3028 +       testb $_TIF_NEED_RESCHED, %cl
3029 +       jnz work_resched
3030 +
3031 +work_notifysig:                                # deal with pending signals and
3032 +                                       # notify-resume requests
3033 +       testl $VM_MASK, EFLAGS(%esp)
3034 +       movl %esp, %eax
3035 +       jne work_notifysig_v86          # returning to kernel-space or
3036 +                                       # vm86-space
3037 +       xorl %edx, %edx
3038 +       call do_notify_resume
3039 +       jmp resume_userspace
3040 +
3041 +       ALIGN
3042 +work_notifysig_v86:
3043 +#ifdef CONFIG_VM86
3044 +       pushl %ecx                      # save ti_flags for do_notify_resume
3045 +       call save_v86_state             # %eax contains pt_regs pointer
3046 +       popl %ecx
3047 +       movl %eax, %esp
3048 +       xorl %edx, %edx
3049 +       call do_notify_resume
3050 +       jmp resume_userspace
3051 +#endif
3052 +
3053 +       # perform syscall exit tracing
3054 +       ALIGN
3055 +syscall_trace_entry:
3056 +       movl $-ENOSYS,EAX(%esp)
3057 +       movl %esp, %eax
3058 +       xorl %edx,%edx
3059 +       call do_syscall_trace
3060 +       cmpl $0, %eax
3061 +       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
3062 +                                       # so must skip actual syscall
3063 +       movl ORIG_EAX(%esp), %eax
3064 +       cmpl $(nr_syscalls), %eax
3065 +       jnae syscall_call
3066 +       jmp syscall_exit
3067 +
3068 +       # perform syscall exit tracing
3069 +       ALIGN
3070 +syscall_exit_work:
3071 +       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
3072 +       jz work_pending
3073 +       ENABLE_INTERRUPTS               # could let do_syscall_trace() call
3074 +                                       # schedule() instead
3075 +       movl %esp, %eax
3076 +       movl $1, %edx
3077 +       call do_syscall_trace
3078 +       jmp resume_userspace
3079 +
3080 +       ALIGN
3081 +syscall_fault:
3082 +       pushl %eax                      # save orig_eax
3083 +       SAVE_ALL
3084 +       GET_THREAD_INFO(%ebp)
3085 +       movl $-EFAULT,EAX(%esp)
3086 +       jmp resume_userspace
3087 +
3088 +       ALIGN
3089 +syscall_badsys:
3090 +       movl $-ENOSYS,EAX(%esp)
3091 +       jmp resume_userspace
3092 +
3093 +#ifndef CONFIG_XEN
3094 +#define FIXUP_ESPFIX_STACK \
3095 +       movl %esp, %eax; \
3096 +       /* switch to 32bit stack using the pointer on top of 16bit stack */ \
3097 +       lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
3098 +       /* copy data from 16bit stack to 32bit stack */ \
3099 +       call fixup_x86_bogus_stack; \
3100 +       /* put ESP to the proper location */ \
3101 +       movl %eax, %esp;
3102 +#define UNWIND_ESPFIX_STACK \
3103 +       pushl %eax; \
3104 +       movl %ss, %eax; \
3105 +       /* see if on 16bit stack */ \
3106 +       cmpw $__ESPFIX_SS, %ax; \
3107 +       jne 28f; \
3108 +       movl $__KERNEL_DS, %edx; \
3109 +       movl %edx, %ds; \
3110 +       movl %edx, %es; \
3111 +       /* switch to 32bit stack */ \
3112 +       FIXUP_ESPFIX_STACK \
3113 +28:    popl %eax;
3114 +
3115 +/*
3116 + * Build the entry stubs and pointer table with
3117 + * some assembler magic.
3118 + */
3119 +.data
3120 +ENTRY(interrupt)
3121 +.text
3122 +
3123 +vector=0
3124 +ENTRY(irq_entries_start)
3125 +.rept NR_IRQS
3126 +       ALIGN
3127 +1:     pushl $~(vector)
3128 +       jmp common_interrupt
3129 +.data
3130 +       .long 1b
3131 +.text
3132 +vector=vector+1
3133 +.endr
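+
+# Editorial sketch, not part of the original patch: the first two
+# expansions of the .rept block above. Each stub pushes the bitwise NOT
+# of its vector and jumps to the common handler, while the interleaved
+# .data fragments collect the stub addresses into the 'interrupt' table:
+#
+#      1:      pushl $~(0)             # stub for vector 0 (.text)
+#              jmp common_interrupt
+#              .long 1b                # interrupt[0] (.data)
+#      1:      pushl $~(1)             # stub for vector 1
+#              jmp common_interrupt
+#              .long 1b                # interrupt[1]
+#
+# Storing ~vector keeps the saved orig_eax distinct from syscall
+# numbers; the IRQ path undoes the NOT to recover the vector.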
3134 +
3135 +       ALIGN
3136 +common_interrupt:
3137 +       SAVE_ALL
3138 +       movl %esp,%eax
3139 +       call do_IRQ
3140 +       jmp ret_from_intr
3141 +
3142 +#define BUILD_INTERRUPT(name, nr)      \
3143 +ENTRY(name)                            \
3144 +       pushl $~(nr);                   \
3145 +       SAVE_ALL                        \
3146 +       movl %esp,%eax;                 \
3147 +       call smp_/**/name;              \
3148 +       jmp ret_from_intr;
3149 +
3150 +/* The include is where all of the SMP etc. interrupts come from */
3151 +#include "entry_arch.h"
3152 +#else
3153 +#define UNWIND_ESPFIX_STACK
3154 +#endif
3155 +
3156 +ENTRY(divide_error)
3157 +       pushl $0                        # no error code
3158 +       pushl $do_divide_error
3159 +       ALIGN
3160 +error_code:
3161 +       pushl %ds
3162 +       pushl %eax
3163 +       xorl %eax, %eax
3164 +       pushl %ebp
3165 +       pushl %edi
3166 +       pushl %esi
3167 +       pushl %edx
3168 +       decl %eax                       # eax = -1
3169 +       pushl %ecx
3170 +       pushl %ebx
3171 +       cld
3172 +       pushl %es
3173 +       UNWIND_ESPFIX_STACK
3174 +       popl %ecx
3175 +       movl ES(%esp), %edi             # get the function address
3176 +       movl ORIG_EAX(%esp), %edx       # get the error code
3177 +       movl %eax, ORIG_EAX(%esp)
3178 +       movl %ecx, ES(%esp)
3179 +       movl $(__USER_DS), %ecx
3180 +       movl %ecx, %ds
3181 +       movl %ecx, %es
3182 +       movl %esp,%eax                  # pt_regs pointer
3183 +       call *%edi
3184 +       jmp ret_from_exception
3185 +
3186 +#ifdef CONFIG_XEN
3187 +# A note on the "critical region" in our callback handler.
3188 +# We want to avoid stacking callback handlers due to events occurring
3189 +# during handling of the last event. To do this, we keep events disabled
3190 +# until we've done all processing. HOWEVER, we must enable events before
3191 +# popping the stack frame (can't be done atomically) and so it would still
3192 +# be possible to get enough handler activations to overflow the stack.
3193 +# Although unlikely, bugs of that kind are hard to track down, so we'd
3194 +# like to avoid the possibility.
3195 +# So, on entry to the handler we detect whether we interrupted an
3196 +# existing activation in its critical region -- if so, we pop the current
3197 +# activation and restart the handler using the previous one.
3198 +ENTRY(hypervisor_callback)
3199 +       pushl %eax
3200 +       SAVE_ALL
3201 +       movl EIP(%esp),%eax
3202 +       cmpl $scrit,%eax
3203 +       jb   11f
3204 +       cmpl $ecrit,%eax
3205 +       jb   critical_region_fixup
3206 +11:    push %esp
3207 +       call evtchn_do_upcall
3208 +       add  $4,%esp
3209 +       jmp  ret_from_intr
3210 +
3211 +        ALIGN
3212 +restore_all_enable_events:
3213 +       __ENABLE_INTERRUPTS
3214 +scrit: /**** START OF CRITICAL REGION ****/
3215 +       __TEST_PENDING
3216 +       jnz  14f                        # process more events if necessary...
3217 +       RESTORE_REGS
3218 +       addl $4, %esp
3219 +1:     iret
3220 +.section __ex_table,"a"
3221 +       .align 4
3222 +       .long 1b,iret_exc
3223 +.previous
3224 +14:    __DISABLE_INTERRUPTS
3225 +       jmp  11b
3226 +ecrit:  /**** END OF CRITICAL REGION ****/
3227 +# [How we do the fixup]. We want to merge the current stack frame with the
3228 +# just-interrupted frame. How we do this depends on where in the critical
3229 +# region the interrupted handler was executing, and so how many saved
3230 +# registers are in each frame. We do this quickly using the lookup table
3231 +# 'critical_fixup_table'. For each byte offset in the critical region, it
3232 +# provides the number of bytes which have already been popped from the
3233 +# interrupted stack frame.
3234 +critical_region_fixup:
3235 +       addl $critical_fixup_table-scrit,%eax
3236 +       movzbl (%eax),%eax              # %eax contains num bytes popped
3237 +       cmpb $0xff,%al                  # 0xff => vcpu_info critical region
3238 +       jne  15f
3239 +       GET_THREAD_INFO(%ebp)
3240 +        xorl %eax,%eax
3241 +15:    mov  %esp,%esi
3242 +       add  %eax,%esi                  # %esi points at end of src region
3243 +       mov  %esp,%edi
3244 +       add  $0x34,%edi                 # %edi points at end of dst region
3245 +       mov  %eax,%ecx
3246 +       shr  $2,%ecx                    # convert bytes to longwords
3247 +       je   17f                        # skip loop if nothing to copy
3248 +16:    subl $4,%esi                    # pre-decrementing copy loop
3249 +       subl $4,%edi
3250 +       movl (%esi),%eax
3251 +       movl %eax,(%edi)
3252 +       loop 16b
3253 +17:    movl %edi,%esp                  # final %edi is top of merged stack
3254 +       jmp  11b
3255 +
3256 +critical_fixup_table:
3257 +       .byte 0xff,0xff,0xff            # testb $0xff,(%esi) = __TEST_PENDING
3258 +       .byte 0xff,0xff                 # jnz  14f
3259 +       .byte 0x00                      # pop  %ebx
3260 +       .byte 0x04                      # pop  %ecx
3261 +       .byte 0x08                      # pop  %edx
3262 +       .byte 0x0c                      # pop  %esi
3263 +       .byte 0x10                      # pop  %edi
3264 +       .byte 0x14                      # pop  %ebp
3265 +       .byte 0x18                      # pop  %eax
3266 +       .byte 0x1c                      # pop  %ds
3267 +       .byte 0x20                      # pop  %es
3268 +       .byte 0x24,0x24,0x24            # add  $4,%esp
3269 +       .byte 0x28                      # iret
3270 +       .byte 0xff,0xff,0xff,0xff       # movb $1,1(%esi)
3271 +       .byte 0x00,0x00                 # jmp  11b
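+
+# Editorial sketch, not part of the original patch: the fixup above in
+# C-like form, where n is the table entry for the interrupted EIP:
+#
+#      n   = critical_fixup_table[eip - scrit]; /* bytes already popped */
+#      src = esp + n;          /* end of the current (inner) frame      */
+#      dst = esp + 0x34;       /* end of the interrupted (outer) frame  */
+#      for (i = n / 4; i; i--)
+#              *--dst = *--src;        /* 32-bit copies, high to low    */
+#      esp = dst;              /* i.e. old esp + 0x34 - n; resume at 11: */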
3272 +
3273 +# Hypervisor uses this for application faults while it executes.
3274 +# We get here for two reasons:
3275 +#  1. Fault while reloading DS, ES, FS or GS
3276 +#  2. Fault while executing IRET
3277 +# Category 1 we fix up by reattempting the load, and zeroing the segment
3278 +# register if the load fails.
3279 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
3280 +# normal Linux return path in this case because if we use the IRET hypercall
3281 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
3282 +# We distinguish between categories by maintaining a status value in EAX.
3283 +ENTRY(failsafe_callback)
3284 +       pushl %eax
3285 +       movl $1,%eax
3286 +1:     mov 4(%esp),%ds
3287 +2:     mov 8(%esp),%es
3288 +3:     mov 12(%esp),%fs
3289 +4:     mov 16(%esp),%gs
3290 +       testl %eax,%eax
3291 +       popl %eax
3292 +       jz 5f
3293 +       addl $16,%esp           # EAX != 0 => Category 2 (Bad IRET)
3294 +       jmp iret_exc
3295 +5:     addl $16,%esp           # EAX == 0 => Category 1 (Bad segment)
3296 +       pushl $0
3297 +       SAVE_ALL
3298 +       jmp ret_from_exception
3299 +.section .fixup,"ax";          \
3300 +6:     xorl %eax,%eax;         \
3301 +       movl %eax,4(%esp);      \
3302 +       jmp 1b;                 \
3303 +7:     xorl %eax,%eax;         \
3304 +       movl %eax,8(%esp);      \
3305 +       jmp 2b;                 \
3306 +8:     xorl %eax,%eax;         \
3307 +       movl %eax,12(%esp);     \
3308 +       jmp 3b;                 \
3309 +9:     xorl %eax,%eax;         \
3310 +       movl %eax,16(%esp);     \
3311 +       jmp 4b;                 \
3312 +.previous;                     \
3313 +.section __ex_table,"a";       \
3314 +       .align 4;               \
3315 +       .long 1b,6b;            \
3316 +       .long 2b,7b;            \
3317 +       .long 3b,8b;            \
3318 +       .long 4b,9b;            \
3319 +.previous
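+
+# Editorial note, not part of the original patch: each segment load at
+# 1:..4: above is paired, via the __ex_table entries, with a fixup at
+# 6:..9: that zeroes %eax and the offending saved selector and retries.
+# The testl therefore distinguishes the categories: %eax == 0 means a
+# segment reload faulted (Category 1), while %eax still 1 means all the
+# loads succeeded and the original fault was the IRET itself (Category 2).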
3320 +#endif
3321 +
3322 +ENTRY(coprocessor_error)
3323 +       pushl $0
3324 +       pushl $do_coprocessor_error
3325 +       jmp error_code
3326 +
3327 +ENTRY(simd_coprocessor_error)
3328 +       pushl $0
3329 +       pushl $do_simd_coprocessor_error
3330 +       jmp error_code
3331 +
3332 +ENTRY(device_not_available)
3333 +       pushl $-1                       # mark this as an int
3334 +       SAVE_ALL
3335 +#ifndef CONFIG_XEN
3336 +       movl %cr0, %eax
3337 +       testl $0x4, %eax                # EM (math emulation bit)
3338 +       je device_available_emulate
3339 +       pushl $0                        # temporary storage for ORIG_EIP
3340 +       call math_emulate
3341 +       addl $4, %esp
3342 +       jmp ret_from_exception
3343 +device_available_emulate:
3344 +#endif
3345 +       preempt_stop
3346 +       call math_state_restore
3347 +       jmp ret_from_exception
3348 +
3349 +#ifdef CONFIG_X86_SYSENTER
3350 +/*
3351 + * Debug traps and NMI can happen at the one SYSENTER instruction
3352 + * that sets up the real kernel stack. Check here, since we can't
3353 + * allow the wrong stack to be used.
3354 + *
3355 + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
3356 + * already pushed 3 words if it hits on the sysenter instruction:
3357 + * eflags, cs and eip.
3358 + *
3359 + * We just load the right stack, and push the three (known) values
3360 + * by hand onto the new stack - while updating the return eip past
3361 + * the instruction that would have done it for sysenter.
3362 + */
3363 +#define FIX_STACK(offset, ok, label)           \
3364 +       cmpw $__KERNEL_CS,4(%esp);              \
3365 +       jne ok;                                 \
3366 +label:                                         \
3367 +       movl TSS_sysenter_esp0+offset(%esp),%esp;       \
3368 +       pushfl;                                 \
3369 +       pushl $__KERNEL_CS;                     \
3370 +       pushl $sysenter_past_esp
3371 +#endif /* CONFIG_X86_SYSENTER */
3372 +
3373 +KPROBE_ENTRY(debug)
3374 +#ifdef CONFIG_X86_SYSENTER
3375 +       cmpl $sysenter_entry,(%esp)
3376 +       jne debug_stack_correct
3377 +       FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
3378 +debug_stack_correct:
3379 +#endif /* CONFIG_X86_SYSENTER */
3380 +       pushl $-1                       # mark this as an int
3381 +       SAVE_ALL
3382 +       xorl %edx,%edx                  # error code 0
3383 +       movl %esp,%eax                  # pt_regs pointer
3384 +       call do_debug
3385 +       jmp ret_from_exception
3386 +       .previous .text
3387 +
3388 +#ifndef CONFIG_XEN
3389 +/*
3390 + * NMI is doubly nasty. It can happen _while_ we're handling
3391 + * a debug fault, and the debug fault hasn't yet been able to
3392 + * clear up the stack. So we first check whether we got an
3393 + * NMI on the sysenter entry path, but after that we need to
3394 + * check whether we got an NMI on the debug path where the debug
3395 + * fault happened on the sysenter path.
3396 + */
3397 +ENTRY(nmi)
3398 +       pushl %eax
3399 +       movl %ss, %eax
3400 +       cmpw $__ESPFIX_SS, %ax
3401 +       popl %eax
3402 +       je nmi_16bit_stack
3403 +       cmpl $sysenter_entry,(%esp)
3404 +       je nmi_stack_fixup
3405 +       pushl %eax
3406 +       movl %esp,%eax
3407 +       /* Do not access memory above the end of our stack page,
3408 +        * it might not exist.
3409 +        */
3410 +       andl $(THREAD_SIZE-1),%eax
3411 +       cmpl $(THREAD_SIZE-20),%eax
3412 +       popl %eax
3413 +       jae nmi_stack_correct
3414 +       cmpl $sysenter_entry,12(%esp)
3415 +       je nmi_debug_stack_check
3416 +nmi_stack_correct:
3417 +       pushl %eax
3418 +       SAVE_ALL
3419 +       xorl %edx,%edx          # zero error code
3420 +       movl %esp,%eax          # pt_regs pointer
3421 +       call do_nmi
3422 +       jmp restore_all
3423 +
3424 +nmi_stack_fixup:
3425 +       FIX_STACK(12,nmi_stack_correct, 1)
3426 +       jmp nmi_stack_correct
3427 +nmi_debug_stack_check:
3428 +       cmpw $__KERNEL_CS,16(%esp)
3429 +       jne nmi_stack_correct
3430 +       cmpl $debug,(%esp)
3431 +       jb nmi_stack_correct
3432 +       cmpl $debug_esp_fix_insn,(%esp)
3433 +       ja nmi_stack_correct
3434 +       FIX_STACK(24,nmi_stack_correct, 1)
3435 +       jmp nmi_stack_correct
3436 +
3437 +nmi_16bit_stack:
3438 +       /* build the far pointer for the lss back to the 16bit stack */
3439 +       pushl %ss
3440 +       pushl %esp
3441 +       movzwl %sp, %esp
3442 +       addw $4, (%esp)
3443 +       /* copy the iret frame of 12 bytes */
3444 +       .rept 3
3445 +       pushl 16(%esp)
3446 +       .endr
3447 +       pushl %eax
3448 +       SAVE_ALL
3449 +       FIXUP_ESPFIX_STACK              # %eax == %esp
3450 +       xorl %edx,%edx                  # zero error code
3451 +       call do_nmi
3452 +       RESTORE_REGS
3453 +       lss 12+4(%esp), %esp            # back to 16bit stack
3454 +1:     iret
3455 +.section __ex_table,"a"
3456 +       .align 4
3457 +       .long 1b,iret_exc
3458 +.previous
3459 +#else
3460 +ENTRY(nmi)
3461 +       pushl %eax
3462 +       SAVE_ALL
3463 +       xorl %edx,%edx          # zero error code
3464 +       movl %esp,%eax          # pt_regs pointer
3465 +       call do_nmi
3466 +       orl  $NMI_MASK, EFLAGS(%esp)
3467 +       jmp restore_all
3468 +#endif
3469 +
3470 +KPROBE_ENTRY(int3)
3471 +       pushl $-1                       # mark this as an int
3472 +       SAVE_ALL
3473 +       xorl %edx,%edx          # zero error code
3474 +       movl %esp,%eax          # pt_regs pointer
3475 +       call do_int3
3476 +       jmp ret_from_exception
3477 +       .previous .text
3478 +
3479 +ENTRY(overflow)
3480 +       pushl $0
3481 +       pushl $do_overflow
3482 +       jmp error_code
3483 +
3484 +ENTRY(bounds)
3485 +       pushl $0
3486 +       pushl $do_bounds
3487 +       jmp error_code
3488 +
3489 +ENTRY(invalid_op)
3490 +       pushl $0
3491 +       pushl $do_invalid_op
3492 +       jmp error_code
3493 +
3494 +ENTRY(coprocessor_segment_overrun)
3495 +       pushl $0
3496 +       pushl $do_coprocessor_segment_overrun
3497 +       jmp error_code
3498 +
3499 +ENTRY(invalid_TSS)
3500 +       pushl $do_invalid_TSS
3501 +       jmp error_code
3502 +
3503 +ENTRY(segment_not_present)
3504 +       pushl $do_segment_not_present
3505 +       jmp error_code
3506 +
3507 +ENTRY(stack_segment)
3508 +       pushl $do_stack_segment
3509 +       jmp error_code
3510 +
3511 +KPROBE_ENTRY(general_protection)
3512 +       pushl $do_general_protection
3513 +       jmp error_code
3514 +       .previous .text
3515 +
3516 +ENTRY(alignment_check)
3517 +       pushl $do_alignment_check
3518 +       jmp error_code
3519 +
3520 +KPROBE_ENTRY(page_fault)
3521 +       pushl $do_page_fault
3522 +       jmp error_code
3523 +       .previous .text
3524 +
3525 +#ifdef CONFIG_X86_MCE
3526 +ENTRY(machine_check)
3527 +       pushl $0
3528 +       pushl machine_check_vector
3529 +       jmp error_code
3530 +#endif
3531 +
3532 +ENTRY(fixup_4gb_segment)
3533 +       pushl $do_fixup_4gb_segment
3534 +       jmp error_code
3535 +
3536 +.section .rodata,"a"
3537 +#include "syscall_table.S"
3538 +
3539 +syscall_table_size=(.-sys_call_table)
3540 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/fixup.c tmp-linux-2.6-xen.patch/arch/i386/kernel/fixup.c
3541 --- ref-linux-2.6.16.9/arch/i386/kernel/fixup.c 1970-01-01 01:00:00.000000000 +0100
3542 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/fixup.c    2006-04-10 00:05:52.000000000 +0200
3543 @@ -0,0 +1,95 @@
3544 +/******************************************************************************
3545 + * fixup.c
3546 + * 
3547 + * Binary-rewriting of certain IA32 instructions, on notification by Xen.
3548 + * Used to avoid repeated slow emulation of common instructions used by the
3549 + * user-space TLS (Thread-Local Storage) libraries.
3550 + * 
3551 + * **** NOTE ****
3552 + *  Issues with the binary rewriting have caused it to be removed. Instead
3553 + *  we rely on Xen's emulator to boot the kernel, and then print a banner
3554 + *  message recommending that the user disable /lib/tls.
3555 + * 
3556 + * Copyright (c) 2004, K A Fraser
3557 + * 
3558 + * This program is free software; you can redistribute it and/or modify
3559 + * it under the terms of the GNU General Public License as published by
3560 + * the Free Software Foundation; either version 2 of the License, or
3561 + * (at your option) any later version.
3562 + * 
3563 + * This program is distributed in the hope that it will be useful,
3564 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
3565 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
3566 + * GNU General Public License for more details.
3567 + * 
3568 + * You should have received a copy of the GNU General Public License
3569 + * along with this program; if not, write to the Free Software
3570 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
3571 + */
3572 +
3573 +#include <linux/config.h>
3574 +#include <linux/init.h>
3575 +#include <linux/sched.h>
3576 +#include <linux/slab.h>
3577 +#include <linux/kernel.h>
3578 +#include <linux/delay.h>
3579 +#include <linux/version.h>
3580 +
3581 +#define DP(_f, _args...) printk(KERN_ALERT "  " _f "\n" , ## _args )
3582 +
3583 +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3584 +{
3585 +       static unsigned long printed = 0;
3586 +       char info[100];
3587 +       int i;
3588 +
3589 +       if (test_and_set_bit(0, &printed))
3590 +               return;
3591 +
3592 +       HYPERVISOR_vm_assist(
3593 +               VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify);
3594 +
3595 +       sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
3596 +
3597 +
3598 +       DP("");
3599 +       DP("***************************************************************");
3600 +       DP("***************************************************************");
3601 +       DP("** WARNING: Currently emulating unsupported memory accesses  **");
3602 +       DP("**          in /lib/tls glibc libraries. The emulation is    **");
3603 +       DP("**          slow. To ensure full performance you should      **");
3604 +       DP("**          install a 'xen-friendly' (nosegneg) version of   **");
3605 +       DP("**          the library, or disable tls support by executing **");
3606 +       DP("**          the following as root:                           **");
3607 +       DP("**          mv /lib/tls /lib/tls.disabled                    **");
3608 +       DP("** Offending process: %-38.38s **", info);
3609 +       DP("***************************************************************");
3610 +       DP("***************************************************************");
3611 +       DP("");
3612 +
3613 +       for (i = 5; i > 0; i--) {
3614 +               printk("Pausing... %d", i);
3615 +               mdelay(1000);
3616 +               printk("\b\b\b\b\b\b\b\b\b\b\b\b");
3617 +       }
3618 +
3619 +       printk("Continuing...\n\n");
3620 +}
3621 +
3622 +static int __init fixup_init(void)
3623 +{
3624 +       HYPERVISOR_vm_assist(
3625 +               VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
3626 +       return 0;
3627 +}
3628 +__initcall(fixup_init);
3629 +
3630 +/*
3631 + * Local variables:
3632 + *  c-file-style: "linux"
3633 + *  indent-tabs-mode: t
3634 + *  c-indent-level: 8
3635 + *  c-basic-offset: 8
3636 + *  tab-width: 8
3637 + * End:
3638 + */
3639 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/head-xen.S tmp-linux-2.6-xen.patch/arch/i386/kernel/head-xen.S
3640 --- ref-linux-2.6.16.9/arch/i386/kernel/head-xen.S      1970-01-01 01:00:00.000000000 +0100
3641 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/head-xen.S 2006-04-10 00:05:52.000000000 +0200
3642 @@ -0,0 +1,171 @@
3643 +
3644 +
3645 +.text
3646 +#include <linux/config.h>
3647 +#include <linux/threads.h>
3648 +#include <linux/linkage.h>
3649 +#include <asm/segment.h>
3650 +#include <asm/page.h>
3651 +#include <asm/thread_info.h>
3652 +#include <asm/asm-offsets.h>
3653 +#include <xen/interface/arch-x86_32.h>
3654 +
3655 +/*
3656 + * References to members of the new_cpu_data structure.
3657 + */
3658 +
3659 +#define X86            new_cpu_data+CPUINFO_x86
3660 +#define X86_VENDOR     new_cpu_data+CPUINFO_x86_vendor
3661 +#define X86_MODEL      new_cpu_data+CPUINFO_x86_model
3662 +#define X86_MASK       new_cpu_data+CPUINFO_x86_mask
3663 +#define X86_HARD_MATH  new_cpu_data+CPUINFO_hard_math
3664 +#define X86_CPUID      new_cpu_data+CPUINFO_cpuid_level
3665 +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
3666 +#define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
3667 +
3668 +ENTRY(startup_32)
3669 +       movl %esi,xen_start_info
3670 +       cld
3671 +
3672 +       /* Set up the stack pointer */
3673 +       movl $(init_thread_union+THREAD_SIZE),%esp
3674 +
3675 +       /* get vendor info */
3676 +       xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
3677 +       XEN_CPUID
3678 +       movl %eax,X86_CPUID             # save CPUID level
3679 +       movl %ebx,X86_VENDOR_ID         # lo 4 chars
3680 +       movl %edx,X86_VENDOR_ID+4       # next 4 chars
3681 +       movl %ecx,X86_VENDOR_ID+8       # last 4 chars
3682 +
3683 +       movl $1,%eax            # Use the CPUID instruction to get CPU type
3684 +       XEN_CPUID
3685 +       movb %al,%cl            # save reg for future use
3686 +       andb $0x0f,%ah          # mask processor family
3687 +       movb %ah,X86
3688 +       andb $0xf0,%al          # mask model
3689 +       shrb $4,%al
3690 +       movb %al,X86_MODEL
3691 +       andb $0x0f,%cl          # mask mask revision
3692 +       movb %cl,X86_MASK
3693 +       movl %edx,X86_CAPABILITY
3694 +
3695 +       movb $1,X86_HARD_MATH
3696 +
3697 +       xorl %eax,%eax                  # Clear FS and GS
3698 +       movl %eax,%fs
3699 +       movl %eax,%gs
3700 +       cld                     # gcc2 wants the direction flag cleared at all times
3701 +
3702 +       call start_kernel
3703 +L6:
3704 +       jmp L6                  # main should never return here, but
3705 +                               # just in case, we know what happens.
3706 +
3707 +#define HYPERCALL_PAGE_OFFSET 0x1000
3708 +.org HYPERCALL_PAGE_OFFSET
3709 +ENTRY(hypercall_page)
3710 +.skip 0x1000
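+
+# Editorial note, not part of the original patch: at boot Xen fills this
+# page so that hypercall N begins at hypercall_page + N * 32. Under the
+# 32-bit Xen 3.0 ABI a hypercall is then a simple near call, e.g.:
+#
+#      movl $SCHEDOP_yield, %ebx       # first argument in %ebx
+#      call hypercall_page + __HYPERVISOR_sched_op * 32
+#
+# with further arguments in %ecx, %edx, %esi and %edi.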
3711 +
3712 +/*
3713 + * Real beginning of normal "text" segment
3714 + */
3715 +ENTRY(stext)
3716 +ENTRY(_stext)
3717 +
3718 +/*
3719 + * BSS section
3720 + */
3721 +.section ".bss.page_aligned","w"
3722 +ENTRY(empty_zero_page)
3723 +       .fill 4096,1,0
3724 +
3725 +/*
3726 + * This starts the data section.
3727 + */
3728 +.data
3729 +
3730 +/*
3731 + * The Global Descriptor Table contains 32 quadwords, per-CPU.
3732 + */
3733 +ENTRY(cpu_gdt_table)
3734 +       .quad 0x0000000000000000        /* NULL descriptor */
3735 +       .quad 0x0000000000000000        /* 0x0b reserved */
3736 +       .quad 0x0000000000000000        /* 0x13 reserved */
3737 +       .quad 0x0000000000000000        /* 0x1b reserved */
3738 +       .quad 0x0000000000000000        /* 0x20 unused */
3739 +       .quad 0x0000000000000000        /* 0x28 unused */
3740 +       .quad 0x0000000000000000        /* 0x33 TLS entry 1 */
3741 +       .quad 0x0000000000000000        /* 0x3b TLS entry 2 */
3742 +       .quad 0x0000000000000000        /* 0x43 TLS entry 3 */
3743 +       .quad 0x0000000000000000        /* 0x4b reserved */
3744 +       .quad 0x0000000000000000        /* 0x53 reserved */
3745 +       .quad 0x0000000000000000        /* 0x5b reserved */
3746 +
3747 +       .quad 0x00cf9a000000ffff        /* 0x60 kernel 4GB code at 0x00000000 */
3748 +       .quad 0x00cf92000000ffff        /* 0x68 kernel 4GB data at 0x00000000 */
3749 +       .quad 0x00cffa000000ffff        /* 0x73 user 4GB code at 0x00000000 */
3750 +       .quad 0x00cff2000000ffff        /* 0x7b user 4GB data at 0x00000000 */
3751 +
3752 +       .quad 0x0000000000000000        /* 0x80 TSS descriptor */
3753 +       .quad 0x0000000000000000        /* 0x88 LDT descriptor */
3754 +
3755 +       /*
3756 +        * Segments used for calling PnP BIOS have byte granularity.
3757 +        * The code and data segments have fixed 64k limits;
3758 +        * the transfer segment sizes are set at run time.
3759 +        */
3760 +       .quad 0x0000000000000000        /* 0x90 32-bit code */
3761 +       .quad 0x0000000000000000        /* 0x98 16-bit code */
3762 +       .quad 0x0000000000000000        /* 0xa0 16-bit data */
3763 +       .quad 0x0000000000000000        /* 0xa8 16-bit data */
3764 +       .quad 0x0000000000000000        /* 0xb0 16-bit data */
3765 +
3766 +       /*
3767 +        * The APM segments have byte granularity and their bases
3768 +        * are set at run time.  All have 64k limits.
3769 +        */
3770 +       .quad 0x0000000000000000        /* 0xb8 APM CS    code */
3771 +       .quad 0x0000000000000000        /* 0xc0 APM CS 16 code (16 bit) */
3772 +       .quad 0x0000000000000000        /* 0xc8 APM DS    data */
3773 +
3774 +       .quad 0x0000000000000000        /* 0xd0 - ESPFIX 16-bit SS */
3775 +       .quad 0x0000000000000000        /* 0xd8 - unused */
3776 +       .quad 0x0000000000000000        /* 0xe0 - unused */
3777 +       .quad 0x0000000000000000        /* 0xe8 - unused */
3778 +       .quad 0x0000000000000000        /* 0xf0 - unused */
3779 +       .quad 0x0000000000000000        /* 0xf8 - GDT entry 31: double-fault TSS */
3780 +
3781 +/*
3782 + * __xen_guest information
3783 + */
3784 +.macro utoa value
3785 + .if (\value) < 0 || (\value) >= 0x10
3786 +       utoa (((\value)>>4)&0x0fffffff)
3787 + .endif
3788 + .if ((\value) & 0xf) < 10
3789 +  .byte '0' + ((\value) & 0xf)
3790 + .else
3791 +  .byte 'A' + ((\value) & 0xf) - 10
3792 + .endif
3793 +.endm
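+
+# Editorial sketch, not part of the original patch: the macro above in
+# C, where emit() stands in for the .byte directives:
+#
+#      void utoa(unsigned long v) {
+#              if (v >= 0x10)
+#                      utoa((v >> 4) & 0x0fffffff);
+#              emit((v & 0xf) < 10 ? '0' + (v & 0xf)
+#                                  : 'A' + (v & 0xf) - 10);
+#      }
+#
+# The '< 0' test and the 0x0fffffff mask are needed because assembler
+# .if arithmetic is signed: they make values with the top bit set
+# recurse instead of being emitted as a single bogus digit.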
3794 +
3795 +.section __xen_guest
3796 +       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
3797 +       .ascii  ",XEN_VER=xen-3.0"
3798 +       .ascii  ",VIRT_BASE=0x"
3799 +               utoa __PAGE_OFFSET
3800 +       .ascii  ",HYPERCALL_PAGE=0x"
3801 +               utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
3802 +       .ascii  ",FEATURES=writable_page_tables"
3803 +       .ascii           "|writable_descriptor_tables"
3804 +       .ascii           "|auto_translated_physmap"
3805 +       .ascii           "|pae_pgdir_above_4gb"
3806 +       .ascii           "|supervisor_mode_kernel"
3807 +#ifdef CONFIG_X86_PAE
3808 +       .ascii  ",PAE=yes"
3809 +#else
3810 +       .ascii  ",PAE=no"
3811 +#endif
3812 +       .ascii  ",LOADER=generic"
3813 +       .byte   0
3814 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/init_task-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/init_task-xen.c
3815 --- ref-linux-2.6.16.9/arch/i386/kernel/init_task-xen.c 1970-01-01 01:00:00.000000000 +0100
3816 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/init_task-xen.c    2006-04-10 00:05:52.000000000 +0200
3817 @@ -0,0 +1,51 @@
3818 +#include <linux/mm.h>
3819 +#include <linux/module.h>
3820 +#include <linux/sched.h>
3821 +#include <linux/init.h>
3822 +#include <linux/init_task.h>
3823 +#include <linux/fs.h>
3824 +#include <linux/mqueue.h>
3825 +
3826 +#include <asm/uaccess.h>
3827 +#include <asm/pgtable.h>
3828 +#include <asm/desc.h>
3829 +
3830 +static struct fs_struct init_fs = INIT_FS;
3831 +static struct files_struct init_files = INIT_FILES;
3832 +static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
3833 +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
3834 +
3835 +#define swapper_pg_dir ((pgd_t *)NULL)
3836 +struct mm_struct init_mm = INIT_MM(init_mm);
3837 +#undef swapper_pg_dir
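+
+/*
+ * Editorial note, not part of the original patch: INIT_MM() initializes
+ * .pgd from swapper_pg_dir, so the #define/#undef pair above makes the
+ * initializer expand to NULL here; under Xen the boot page directory is
+ * provided by the hypervisor instead of a static kernel symbol.
+ */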
3838 +
3839 +EXPORT_SYMBOL(init_mm);
3840 +
3841 +/*
3842 + * Initial thread structure.
3843 + *
3844 + * We need to make sure that this is THREAD_SIZE aligned due to the
3845 + * way process stacks are handled. This is done by having a special
3846 + * "init_task" linker map entry..
3847 + */
3848 +union thread_union init_thread_union 
3849 +       __attribute__((__section__(".data.init_task"))) =
3850 +               { INIT_THREAD_INFO(init_task) };
3851 +
3852 +/*
3853 + * Initial task structure.
3854 + *
3855 + * All other task structs will be allocated on slabs in fork.c
3856 + */
3857 +struct task_struct init_task = INIT_TASK(init_task);
3858 +
3859 +EXPORT_SYMBOL(init_task);
3860 +
3861 +#ifndef CONFIG_X86_NO_TSS
3862 +/*
3863 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
3864 + * no more per-task TSS's.
3865 + */ 
3866 +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
3867 +#endif
3868 +
3869 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/io_apic-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/io_apic-xen.c
3870 --- ref-linux-2.6.16.9/arch/i386/kernel/io_apic-xen.c   1970-01-01 01:00:00.000000000 +0100
3871 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/io_apic-xen.c      2006-04-10 00:05:52.000000000 +0200
3872 @@ -0,0 +1,2747 @@
3873 +/*
3874 + *     Intel IO-APIC support for multi-Pentium hosts.
3875 + *
3876 + *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
3877 + *
3878 + *     Many thanks to Stig Venaas for trying out countless experimental
3879 + *     patches and reporting/debugging problems patiently!
3880 + *
3881 + *     (c) 1999, Multiple IO-APIC support, developed by
3882 + *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
3883 + *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
3884 + *     further tested and cleaned up by Zach Brown <zab@redhat.com>
3885 + *     and Ingo Molnar <mingo@redhat.com>
3886 + *
3887 + *     Fixes
3888 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
3889 + *                                     thanks to Eric Gilmore
3890 + *                                     and Rolf G. Tews
3891 + *                                     for testing these extensively
3892 + *     Paul Diefenbaugh        :       Added full ACPI support
3893 + */
3894 +
3895 +#include <linux/mm.h>
3896 +#include <linux/interrupt.h>
3897 +#include <linux/init.h>
3898 +#include <linux/delay.h>
3899 +#include <linux/sched.h>
3900 +#include <linux/config.h>
3901 +#include <linux/smp_lock.h>
3902 +#include <linux/mc146818rtc.h>
3903 +#include <linux/compiler.h>
3904 +#include <linux/acpi.h>
3905 +#include <linux/module.h>
3906 +#include <linux/sysdev.h>
3907 +
3908 +#include <asm/io.h>
3909 +#include <asm/smp.h>
3910 +#include <asm/desc.h>
3911 +#include <asm/timer.h>
3912 +#include <asm/i8259.h>
3913 +
3914 +#include <mach_apic.h>
3915 +
3916 +#include "io_ports.h"
3917 +
3918 +#ifdef CONFIG_XEN
3919 +
3920 +#include <xen/interface/xen.h>
3921 +#include <xen/interface/physdev.h>
3922 +
3923 +/* Fake i8259 */
3924 +#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
3925 +#define disable_8259A_irq(_irq)  ((void)0)
3926 +#define i8259A_irq_pending(_irq) (0)
3927 +
3928 +unsigned long io_apic_irqs;
3929 +
3930 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
3931 +{
3932 +       physdev_op_t op;
3933 +       int ret;
3934 +
3935 +       op.cmd = PHYSDEVOP_APIC_READ;
3936 +       op.u.apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3937 +       op.u.apic_op.reg = reg;
3938 +       ret = HYPERVISOR_physdev_op(&op);
3939 +       if (ret)
3940 +               return ret;
3941 +       return op.u.apic_op.value;
3942 +}
3943 +
3944 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
3945 +{
3946 +       physdev_op_t op;
3947 +
3948 +       op.cmd = PHYSDEVOP_APIC_WRITE;
3949 +       op.u.apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
3950 +       op.u.apic_op.reg = reg;
3951 +       op.u.apic_op.value = value;
3952 +       HYPERVISOR_physdev_op(&op);
3953 +}
3954 +
3955 +#define io_apic_read(a,r)    xen_io_apic_read(a,r)
3956 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
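+
+/*
+ * Editorial note, not part of the original patch: even a privileged
+ * guest does not poke IO-APIC registers through a direct mapping here;
+ * every access is funnelled through the PHYSDEVOP_APIC_{READ,WRITE}
+ * hypercalls above, so the hypervisor keeps final control of the
+ * interrupt hardware.
+ */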
3957 +
3958 +#endif /* CONFIG_XEN */
3959 +
3960 +int (*ioapic_renumber_irq)(int ioapic, int irq);
3961 +atomic_t irq_mis_count;
3962 +
3963 +/* Where (if anywhere) the i8259 is connected in external int mode */
3964 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
3965 +
3966 +static DEFINE_SPINLOCK(ioapic_lock);
3967 +
3968 +int timer_over_8254 __initdata = 1;
3969 +
3970 +/*
3971 + *     Is the SiS APIC rmw bug present?
3972 + *     -1 = don't know, 0 = no, 1 = yes
3973 + */
3974 +int sis_apic_bug = -1;
3975 +
3976 +/*
3977 + * # of IRQ routing registers
3978 + */
3979 +int nr_ioapic_registers[MAX_IO_APICS];
3980 +
3981 +int disable_timer_pin_1 __initdata;
3982 +
3983 +/*
3984 + * A rough estimate of how many shared IRQs there are; it can
3985 + * be changed at any time.
3986 + */
3987 +#define MAX_PLUS_SHARED_IRQS NR_IRQS
3988 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
3989 +
3990 +/*
3991 + * This is performance-critical; we want to do it in O(1)
3992 + *
3993 + * the indexing order of this array favors 1:1 mappings
3994 + * between pins and IRQs.
3995 + */
3996 +
3997 +static struct irq_pin_list {
3998 +       int apic, pin, next;
3999 +} irq_2_pin[PIN_MAP_SIZE];
4000 +
4001 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
4002 +#ifdef CONFIG_PCI_MSI
4003 +#define vector_to_irq(vector)  \
4004 +       (platform_legacy_irq(vector) ? vector : vector_irq[vector])
4005 +#else
4006 +#define vector_to_irq(vector)  (vector)
4007 +#endif
4008 +
4009 +/*
4010 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
4011 + * shared ISA-space IRQs, so we have to support them. We are super
4012 + * fast in the common case, and fast for shared ISA-space IRQs.
4013 + */
4014 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
4015 +{
4016 +       static int first_free_entry = NR_IRQS;
4017 +       struct irq_pin_list *entry = irq_2_pin + irq;
4018 +
4019 +       while (entry->next)
4020 +               entry = irq_2_pin + entry->next;
4021 +
4022 +       if (entry->pin != -1) {
4023 +               entry->next = first_free_entry;
4024 +               entry = irq_2_pin + entry->next;
4025 +               if (++first_free_entry >= PIN_MAP_SIZE)
4026 +                       panic("io_apic.c: whoops");
4027 +       }
4028 +       entry->apic = apic;
4029 +       entry->pin = pin;
4030 +}
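+
+/*
+ * Editorial sketch, not part of the original patch: irq_2_pin is a
+ * singly linked list threaded through the array itself, so the common
+ * 1:1 IRQ<->pin case needs no pointer chasing. A walk (handle() is a
+ * hypothetical consumer) looks like:
+ *
+ *     struct irq_pin_list *e = &irq_2_pin[irq];
+ *     for (;;) {
+ *             if (e->pin != -1)
+ *                     handle(e->apic, e->pin);
+ *             if (!e->next)
+ *                     break;
+ *             e = &irq_2_pin[e->next];
+ *     }
+ */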
4031 +
4032 +#ifdef CONFIG_XEN
4033 +#define clear_IO_APIC() ((void)0)
4034 +#else
4035 +/*
4036 + * Reroute an IRQ to a different pin.
4037 + */
4038 +static void __init replace_pin_at_irq(unsigned int irq,
4039 +                                     int oldapic, int oldpin,
4040 +                                     int newapic, int newpin)
4041 +{
4042 +       struct irq_pin_list *entry = irq_2_pin + irq;
4043 +
4044 +       while (1) {
4045 +               if (entry->apic == oldapic && entry->pin == oldpin) {
4046 +                       entry->apic = newapic;
4047 +                       entry->pin = newpin;
4048 +               }
4049 +               if (!entry->next)
4050 +                       break;
4051 +               entry = irq_2_pin + entry->next;
4052 +       }
4053 +}
4054 +
4055 +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
4056 +{
4057 +       struct irq_pin_list *entry = irq_2_pin + irq;
4058 +       unsigned int pin, reg;
4059 +
4060 +       for (;;) {
4061 +               pin = entry->pin;
4062 +               if (pin == -1)
4063 +                       break;
4064 +               reg = io_apic_read(entry->apic, 0x10 + pin*2);
4065 +               reg &= ~disable;
4066 +               reg |= enable;
4067 +               io_apic_modify(entry->apic, 0x10 + pin*2, reg);
4068 +               if (!entry->next)
4069 +                       break;
4070 +               entry = irq_2_pin + entry->next;
4071 +       }
4072 +}
4073 +
4074 +/* mask = 1 */
4075 +static void __mask_IO_APIC_irq (unsigned int irq)
4076 +{
4077 +       __modify_IO_APIC_irq(irq, 0x00010000, 0);
4078 +}
4079 +
4080 +/* mask = 0 */
4081 +static void __unmask_IO_APIC_irq (unsigned int irq)
4082 +{
4083 +       __modify_IO_APIC_irq(irq, 0, 0x00010000);
4084 +}
4085 +
4086 +/* mask = 1, trigger = 0 */
4087 +static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
4088 +{
4089 +       __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
4090 +}
4091 +
4092 +/* mask = 0, trigger = 1 */
4093 +static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
4094 +{
4095 +       __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
4096 +}
4097 +
4098 +static void mask_IO_APIC_irq (unsigned int irq)
4099 +{
4100 +       unsigned long flags;
4101 +
4102 +       spin_lock_irqsave(&ioapic_lock, flags);
4103 +       __mask_IO_APIC_irq(irq);
4104 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4105 +}
4106 +
4107 +static void unmask_IO_APIC_irq (unsigned int irq)
4108 +{
4109 +       unsigned long flags;
4110 +
4111 +       spin_lock_irqsave(&ioapic_lock, flags);
4112 +       __unmask_IO_APIC_irq(irq);
4113 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4114 +}
4115 +
4116 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
4117 +{
4118 +       struct IO_APIC_route_entry entry;
4119 +       unsigned long flags;
4120 +       
4121 +       /* Check delivery_mode to be sure we're not clearing an SMI pin */
4122 +       spin_lock_irqsave(&ioapic_lock, flags);
4123 +       *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4124 +       *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4125 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4126 +       if (entry.delivery_mode == dest_SMI)
4127 +               return;
4128 +
4129 +       /*
4130 +        * Disable it in the IO-APIC irq-routing table:
4131 +        */
4132 +       memset(&entry, 0, sizeof(entry));
4133 +       entry.mask = 1;
4134 +       spin_lock_irqsave(&ioapic_lock, flags);
4135 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
4136 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
4137 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4138 +}
4139 +
4140 +static void clear_IO_APIC (void)
4141 +{
4142 +       int apic, pin;
4143 +
4144 +       for (apic = 0; apic < nr_ioapics; apic++)
4145 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
4146 +                       clear_IO_APIC_pin(apic, pin);
4147 +}
4148 +
4149 +#ifdef CONFIG_SMP
4150 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
4151 +{
4152 +       unsigned long flags;
4153 +       int pin;
4154 +       struct irq_pin_list *entry = irq_2_pin + irq;
4155 +       unsigned int apicid_value;
4156 +       cpumask_t tmp;
4157 +       
4158 +       cpus_and(tmp, cpumask, cpu_online_map);
4159 +       if (cpus_empty(tmp))
4160 +               tmp = TARGET_CPUS;
4161 +
4162 +       cpus_and(cpumask, tmp, CPU_MASK_ALL);
4163 +
4164 +       apicid_value = cpu_mask_to_apicid(cpumask);
4165 +       /* Prepare to do the io_apic_write */
4166 +       apicid_value = apicid_value << 24;
4167 +       spin_lock_irqsave(&ioapic_lock, flags);
4168 +       for (;;) {
4169 +               pin = entry->pin;
4170 +               if (pin == -1)
4171 +                       break;
4172 +               io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
4173 +               if (!entry->next)
4174 +                       break;
4175 +               entry = irq_2_pin + entry->next;
4176 +       }
4177 +       set_irq_info(irq, cpumask);
4178 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4179 +}
4180 +
4181 +#if defined(CONFIG_IRQBALANCE)
4182 +# include <asm/processor.h>    /* kernel_thread() */
4183 +# include <linux/kernel_stat.h>        /* kstat */
4184 +# include <linux/slab.h>               /* kmalloc() */
4185 +# include <linux/timer.h>      /* time_after() */
4186 +
4187 +# ifdef CONFIG_BALANCED_IRQ_DEBUG
4188 +#  define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
4189 +#  define Dprintk(x...) do { TDprintk(x); } while (0)
4190 +# else
4191 +#  define TDprintk(x...) 
4192 +#  define Dprintk(x...) 
4193 +# endif
4194 +
4195 +
4196 +#define IRQBALANCE_CHECK_ARCH -999
4197 +static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
4198 +static int physical_balance = 0;
4199 +
4200 +static struct irq_cpu_info {
4201 +       unsigned long * last_irq;
4202 +       unsigned long * irq_delta;
4203 +       unsigned long irq;
4204 +} irq_cpu_data[NR_CPUS];
4205 +
4206 +#define CPU_IRQ(cpu)           (irq_cpu_data[cpu].irq)
4207 +#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
4208 +#define IRQ_DELTA(cpu,irq)     (irq_cpu_data[cpu].irq_delta[irq])
4209 +
4210 +#define IDLE_ENOUGH(cpu,now) \
4211 +       (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
4212 +
4213 +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
4214 +
4215 +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
4216 +
4217 +#define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
4218 +#define MIN_BALANCED_IRQ_INTERVAL      (HZ/2)
4219 +#define BALANCED_IRQ_MORE_DELTA                (HZ/10)
4220 +#define BALANCED_IRQ_LESS_DELTA                (HZ)
4221 +
4222 +static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
4223 +
4224 +static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
4225 +                       unsigned long now, int direction)
4226 +{
4227 +       int search_idle = 1;
4228 +       int cpu = curr_cpu;
4229 +
4230 +       goto inside;
4231 +
4232 +       do {
4233 +               if (unlikely(cpu == curr_cpu))
4234 +                       search_idle = 0;
4235 +inside:
4236 +               if (direction == 1) {
4237 +                       cpu++;
4238 +                       if (cpu >= NR_CPUS)
4239 +                               cpu = 0;
4240 +               } else {
4241 +                       cpu--;
4242 +                       if (cpu == -1)
4243 +                               cpu = NR_CPUS-1;
4244 +               }
4245 +       } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
4246 +                       (search_idle && !IDLE_ENOUGH(cpu,now)));
4247 +
4248 +       return cpu;
4249 +}
4250 +
4251 +static inline void balance_irq(int cpu, int irq)
4252 +{
4253 +       unsigned long now = jiffies;
4254 +       cpumask_t allowed_mask;
4255 +       unsigned int new_cpu;
4256 +               
4257 +       if (irqbalance_disabled)
4258 +               return; 
4259 +
4260 +       cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
4261 +       new_cpu = move(cpu, allowed_mask, now, 1);
4262 +       if (cpu != new_cpu) {
4263 +               set_pending_irq(irq, cpumask_of_cpu(new_cpu));
4264 +       }
4265 +}
4266 +
4267 +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
4268 +{
4269 +       int i, j;
4270 +       Dprintk("Rotating IRQs among CPUs.\n");
4271 +       for (i = 0; i < NR_CPUS; i++) {
4272 +               for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
4273 +                       if (!irq_desc[j].action)
4274 +                               continue;
4275 +                       /* Is it a significant load ?  */
4276 +                       if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
4277 +                                               useful_load_threshold)
4278 +                               continue;
4279 +                       balance_irq(i, j);
4280 +               }
4281 +       }
4282 +       balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4283 +               balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
4284 +       return;
4285 +}
4286 +
4287 +static void do_irq_balance(void)
4288 +{
4289 +       int i, j;
4290 +       unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
4291 +       unsigned long move_this_load = 0;
4292 +       int max_loaded = 0, min_loaded = 0;
4293 +       int load;
4294 +       unsigned long useful_load_threshold = balanced_irq_interval + 10;
4295 +       int selected_irq;
4296 +       int tmp_loaded, first_attempt = 1;
4297 +       unsigned long tmp_cpu_irq;
4298 +       unsigned long imbalance = 0;
4299 +       cpumask_t allowed_mask, target_cpu_mask, tmp;
4300 +
4301 +       for (i = 0; i < NR_CPUS; i++) {
4302 +               int package_index;
4303 +               CPU_IRQ(i) = 0;
4304 +               if (!cpu_online(i))
4305 +                       continue;
4306 +               package_index = CPU_TO_PACKAGEINDEX(i);
4307 +               for (j = 0; j < NR_IRQS; j++) {
4308 +                       unsigned long value_now, delta;
4309 +                       /* Is this an active IRQ? */
4310 +                       if (!irq_desc[j].action)
4311 +                               continue;
4312 +                       if ( package_index == i )
4313 +                               IRQ_DELTA(package_index,j) = 0;
4314 +                       /* Determine the total count per processor per IRQ */
4315 +                       value_now = (unsigned long) kstat_cpu(i).irqs[j];
4316 +
4317 +                       /* Determine the activity per processor per IRQ */
4318 +                       delta = value_now - LAST_CPU_IRQ(i,j);
4319 +
4320 +                       /* Update last_cpu_irq[][] for the next time */
4321 +                       LAST_CPU_IRQ(i,j) = value_now;
4322 +
4323 +                       /* Ignore IRQs whose rate is less than the clock */
4324 +                       if (delta < useful_load_threshold)
4325 +                               continue;
4326 +                       /* update the load for the processor or package total */
4327 +                       IRQ_DELTA(package_index,j) += delta;
4328 +
4329 +                       /* Keep track of the higher numbered sibling as well */
4330 +                       if (i != package_index)
4331 +                               CPU_IRQ(i) += delta;
4332 +                       /*
4333 +                        * We have sibling A and sibling B in the package
4334 +                        *
4335 +                        * cpu_irq[A] = load for cpu A + load for cpu B
4336 +                        * cpu_irq[B] = load for cpu B
4337 +                        */
4338 +                       CPU_IRQ(package_index) += delta;
4339 +               }
4340 +       }
4341 +       /* Find the least loaded processor package */
4342 +       for (i = 0; i < NR_CPUS; i++) {
4343 +               if (!cpu_online(i))
4344 +                       continue;
4345 +               if (i != CPU_TO_PACKAGEINDEX(i))
4346 +                       continue;
4347 +               if (min_cpu_irq > CPU_IRQ(i)) {
4348 +                       min_cpu_irq = CPU_IRQ(i);
4349 +                       min_loaded = i;
4350 +               }
4351 +       }
4352 +       max_cpu_irq = ULONG_MAX;
4353 +
4354 +tryanothercpu:
4355 +       /* Look for heaviest loaded processor.
4356 +        * We may come back to get the next heaviest loaded processor.
4357 +        * Skip processors with trivial loads.
4358 +        */
4359 +       tmp_cpu_irq = 0;
4360 +       tmp_loaded = -1;
4361 +       for (i = 0; i < NR_CPUS; i++) {
4362 +               if (!cpu_online(i))
4363 +                       continue;
4364 +               if (i != CPU_TO_PACKAGEINDEX(i))
4365 +                       continue;
4366 +               if (max_cpu_irq <= CPU_IRQ(i)) 
4367 +                       continue;
4368 +               if (tmp_cpu_irq < CPU_IRQ(i)) {
4369 +                       tmp_cpu_irq = CPU_IRQ(i);
4370 +                       tmp_loaded = i;
4371 +               }
4372 +       }
4373 +
4374 +       if (tmp_loaded == -1) {
4375 +        /* With a small number of heavy interrupt sources, some of
4376 +         * the cpus can end up loaded too much. We use Ingo's original
4377 +         * approach and rotate the IRQs around.
4378 +         */
4379 +               if (!first_attempt && imbalance >= useful_load_threshold) {
4380 +                       rotate_irqs_among_cpus(useful_load_threshold);
4381 +                       return;
4382 +               }
4383 +               goto not_worth_the_effort;
4384 +       }
4385 +       
4386 +       first_attempt = 0;              /* heaviest search */
4387 +       max_cpu_irq = tmp_cpu_irq;      /* load */
4388 +       max_loaded = tmp_loaded;        /* processor */
4389 +       imbalance = (max_cpu_irq - min_cpu_irq) / 2;
4390 +       
4391 +       Dprintk("max_loaded cpu = %d\n", max_loaded);
4392 +       Dprintk("min_loaded cpu = %d\n", min_loaded);
4393 +       Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
4394 +       Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
4395 +       Dprintk("load imbalance = %lu\n", imbalance);
4396 +
4397 +       /* If the imbalance is less than approx 10% of the max load,
4398 +        * further balancing gives diminishing returns - quit.
4399 +        */
4400 +       if (imbalance < (max_cpu_irq >> 3)) {
4401 +               Dprintk("Imbalance too trivial\n");
4402 +               goto not_worth_the_effort;
4403 +       }
4404 +
4405 +tryanotherirq:
4406 +       /* if we select an IRQ to move that can't go where we want, then
4407 +        * see if there is another one to try.
4408 +        */
4409 +       move_this_load = 0;
4410 +       selected_irq = -1;
4411 +       for (j = 0; j < NR_IRQS; j++) {
4412 +               /* Is this an active IRQ? */
4413 +               if (!irq_desc[j].action)
4414 +                       continue;
4415 +               if (imbalance <= IRQ_DELTA(max_loaded,j))
4416 +                       continue;
4417 +               /* Try to find the IRQ that is closest to the imbalance
4418 +                * without going over.
4419 +                */
4420 +               if (move_this_load < IRQ_DELTA(max_loaded,j)) {
4421 +                       move_this_load = IRQ_DELTA(max_loaded,j);
4422 +                       selected_irq = j;
4423 +               }
4424 +       }
4425 +       if (selected_irq == -1) {
4426 +               goto tryanothercpu;
4427 +       }
4428 +
4429 +       imbalance = move_this_load;
4430 +       
4431 +       /* For the physical_balance case, we accumulated both load
4432 +        * values in one of the siblings' cpu_irq[], so that the same
4433 +        * code can be used for physical and logical processors
4434 +        * as much as possible. 
4435 +        *
4436 +        * NOTE: the cpu_irq[] array holds the sum of the load for
4437 +        * sibling A and sibling B in the slot for the lowest numbered
4438 +        * sibling (A), _AND_ the load for sibling B in the slot for
4439 +        * the higher numbered sibling.
4440 +        *
4441 +        * We seek the least loaded sibling by making the comparison
4442 +        * (A+B)/2 vs B
4443 +        */
4444 +       load = CPU_IRQ(min_loaded) >> 1;
4445 +       for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
4446 +               if (load > CPU_IRQ(j)) {
4447 +                       /* This won't change cpu_sibling_map[min_loaded] */
4448 +                       load = CPU_IRQ(j);
4449 +                       min_loaded = j;
4450 +               }
4451 +       }
4452 +
4453 +       cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
4454 +       target_cpu_mask = cpumask_of_cpu(min_loaded);
4455 +       cpus_and(tmp, target_cpu_mask, allowed_mask);
4456 +
4457 +       if (!cpus_empty(tmp)) {
4458 +
4459 +               Dprintk("irq = %d moved to cpu = %d\n",
4460 +                               selected_irq, min_loaded);
4461 +               /* mark for change destination */
4462 +               set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
4463 +
4464 +               /* Since we made a change, come back sooner to 
4465 +                * check for more variation.
4466 +                */
4467 +               balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4468 +                       balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
4469 +               return;
4470 +       }
4471 +       goto tryanotherirq;
4472 +
4473 +not_worth_the_effort:
4474 +       /*
4475 +        * if we did not find an IRQ to move, then adjust the time interval
4476 +        * upward
4477 +        */
4478 +       balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
4479 +               balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);       
4480 +       Dprintk("IRQ worth rotating not found\n");
4481 +       return;
4482 +}
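+
+/*
+ * Editorial sketch, not part of the original patch, of the flow above:
+ *
+ *     refresh per-cpu, per-irq deltas from kstat;
+ *     min_loaded = least loaded package;
+ *     while (a heavier, not yet tried package exists) {
+ *             max_loaded = that package;
+ *             while (an untried IRQ on max_loaded exists) {
+ *                     pick the one whose delta comes closest to the
+ *                     imbalance without exceeding it;
+ *                     if (its affinity allows min_loaded) {
+ *                             set_pending_irq(irq,
+ *                                     cpumask_of_cpu(min_loaded));
+ *                             return;
+ *                     }
+ *             }
+ *     }
+ *     lengthen balanced_irq_interval (or rotate all IRQs) and return;
+ */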
4483 +
4484 +static int balanced_irq(void *unused)
4485 +{
4486 +       int i;
4487 +       unsigned long prev_balance_time = jiffies;
4488 +       long time_remaining = balanced_irq_interval;
4489 +
4490 +       daemonize("kirqd");
4491 +       
4492 +       /* push everything to CPU 0 to give us a starting point.  */
4493 +       for (i = 0 ; i < NR_IRQS ; i++) {
4494 +               pending_irq_cpumask[i] = cpumask_of_cpu(0);
4495 +               set_pending_irq(i, cpumask_of_cpu(0));
4496 +       }
4497 +
4498 +       for ( ; ; ) {
4499 +               time_remaining = schedule_timeout_interruptible(time_remaining);
4500 +               try_to_freeze();
4501 +               if (time_after(jiffies,
4502 +                               prev_balance_time+balanced_irq_interval)) {
4503 +                       preempt_disable();
4504 +                       do_irq_balance();
4505 +                       prev_balance_time = jiffies;
4506 +                       time_remaining = balanced_irq_interval;
4507 +                       preempt_enable();
4508 +               }
4509 +       }
4510 +       return 0;
4511 +}
4512 +
4513 +static int __init balanced_irq_init(void)
4514 +{
4515 +       int i;
4516 +       struct cpuinfo_x86 *c;
4517 +       cpumask_t tmp;
4518 +
4519 +       cpus_shift_right(tmp, cpu_online_map, 2);
4520 +        c = &boot_cpu_data;
4521 +       /* If not overridden on the command line, ask the subarchitecture. */
4522 +       if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
4523 +               irqbalance_disabled = NO_BALANCE_IRQ;
4524 +       if (irqbalance_disabled)
4525 +               return 0;
4526 +       
4527 +        /* disable irqbalance completely if there is only one processor online */
4528 +       if (num_online_cpus() < 2) {
4529 +               irqbalance_disabled = 1;
4530 +               return 0;
4531 +       }
4532 +       /*
4533 +        * Enable physical balance only if more than 1 physical processor
4534 +        * is present
4535 +        */
4536 +       if (smp_num_siblings > 1 && !cpus_empty(tmp))
4537 +               physical_balance = 1;
4538 +
4539 +       for (i = 0; i < NR_CPUS; i++) {
4540 +               if (!cpu_online(i))
4541 +                       continue;
4542 +               irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4543 +               irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4544 +               if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
4545 +                       printk(KERN_ERR "balanced_irq_init: out of memory");
4546 +                       goto failed;
4547 +               }
4548 +               memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
4549 +               memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
4550 +       }
4551 +       
4552 +       printk(KERN_INFO "Starting balanced_irq\n");
4553 +       if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) 
4554 +               return 0;
4555 +       else 
4556 +               printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
4557 +failed:
4558 +       for (i = 0; i < NR_CPUS; i++) {
4559 +               kfree(irq_cpu_data[i].irq_delta);
4560 +               kfree(irq_cpu_data[i].last_irq);
4561 +       }
4562 +       return 0;
4563 +}
4564 +
4565 +int __init irqbalance_disable(char *str)
4566 +{
4567 +       irqbalance_disabled = 1;
4568 +       return 0;
4569 +}
4570 +
4571 +__setup("noirqbalance", irqbalance_disable);
4572 +
4573 +late_initcall(balanced_irq_init);
4574 +#endif /* CONFIG_IRQBALANCE */
4575 +#endif /* CONFIG_SMP */
4576 +#endif
4577 +
4578 +#ifndef CONFIG_SMP
4579 +void fastcall send_IPI_self(int vector)
4580 +{
4581 +#ifndef CONFIG_XEN
4582 +       unsigned int cfg;
4583 +
4584 +       /*
4585 +        * Wait for idle.
4586 +        */
4587 +       apic_wait_icr_idle();
4588 +       cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
4589 +       /*
4590 +        * Send the IPI. The write to APIC_ICR fires this off.
4591 +        */
4592 +       apic_write_around(APIC_ICR, cfg);
4593 +#endif
4594 +}
4595 +#endif /* !CONFIG_SMP */
4596 +
4597 +
4598 +/*
4599 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
4600 + * specific CPU-side IRQs.
4601 + */
4602 +
4603 +#define MAX_PIRQS 8
4604 +static int pirq_entries [MAX_PIRQS];
4605 +static int pirqs_enabled;
4606 +int skip_ioapic_setup;
4607 +
4608 +static int __init ioapic_setup(char *str)
4609 +{
4610 +       skip_ioapic_setup = 1;
4611 +       return 1;
4612 +}
4613 +
4614 +__setup("noapic", ioapic_setup);
4615 +
4616 +static int __init ioapic_pirq_setup(char *str)
4617 +{
4618 +       int i, max;
4619 +       int ints[MAX_PIRQS+1];
4620 +
4621 +       get_options(str, ARRAY_SIZE(ints), ints);
4622 +
4623 +       for (i = 0; i < MAX_PIRQS; i++)
4624 +               pirq_entries[i] = -1;
4625 +
4626 +       pirqs_enabled = 1;
4627 +       apic_printk(APIC_VERBOSE, KERN_INFO
4628 +                       "PIRQ redirection, working around broken MP-BIOS.\n");
4629 +       max = MAX_PIRQS;
4630 +       if (ints[0] < MAX_PIRQS)
4631 +               max = ints[0];
4632 +
4633 +       for (i = 0; i < max; i++) {
4634 +               apic_printk(APIC_VERBOSE, KERN_DEBUG
4635 +                               "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
4636 +               /*
4637 +                * PIRQs are mapped upside down, usually.
4638 +                */
4639 +               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
4640 +       }
4641 +       return 1;
4642 +}
4643 +
4644 +__setup("pirq=", ioapic_pirq_setup);
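
Concretely (a standalone, illustrative rendering of the loop above, compiled outside the kernel): booting with pirq=5,9 makes get_options() fill ints[] = {2, 5, 9}, so PIRQ7 is redirected to IRQ 5 and PIRQ6 to IRQ 9, while the remaining entries stay -1:

	/* Standalone sketch of the pirq= mapping; illustrative only. */
	#include <stdio.h>

	#define MAX_PIRQS 8

	int main(void)
	{
		/* As if booted with "pirq=5,9": ints[0] holds the count. */
		int ints[MAX_PIRQS + 1] = { 2, 5, 9 };
		int pirq_entries[MAX_PIRQS];
		int i, max = ints[0] < MAX_PIRQS ? ints[0] : MAX_PIRQS;

		for (i = 0; i < MAX_PIRQS; i++)
			pirq_entries[i] = -1;
		for (i = 0; i < max; i++)
			pirq_entries[MAX_PIRQS - i - 1] = ints[i + 1];
		for (i = 0; i < MAX_PIRQS; i++)
			printf("PIRQ%d -> IRQ %d\n", i, pirq_entries[i]);
		return 0;	/* prints PIRQ6 -> 9 and PIRQ7 -> 5 */
	}
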
4645 +
4646 +/*
4647 + * Find the IRQ entry number of a certain pin.
4648 + */
4649 +static int find_irq_entry(int apic, int pin, int type)
4650 +{
4651 +       int i;
4652 +
4653 +       for (i = 0; i < mp_irq_entries; i++)
4654 +               if (mp_irqs[i].mpc_irqtype == type &&
4655 +                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
4656 +                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
4657 +                   mp_irqs[i].mpc_dstirq == pin)
4658 +                       return i;
4659 +
4660 +       return -1;
4661 +}
4662 +
4663 +/*
4664 + * Find the pin to which IRQ[irq] (ISA) is connected
4665 + */
4666 +static int __init find_isa_irq_pin(int irq, int type)
4667 +{
4668 +       int i;
4669 +
4670 +       for (i = 0; i < mp_irq_entries; i++) {
4671 +               int lbus = mp_irqs[i].mpc_srcbus;
4672 +
4673 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4674 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4675 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4676 +                    mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4677 +                   ) &&
4678 +                   (mp_irqs[i].mpc_irqtype == type) &&
4679 +                   (mp_irqs[i].mpc_srcbusirq == irq))
4680 +
4681 +                       return mp_irqs[i].mpc_dstirq;
4682 +       }
4683 +       return -1;
4684 +}
4685 +
4686 +static int __init find_isa_irq_apic(int irq, int type)
4687 +{
4688 +       int i;
4689 +
4690 +       for (i = 0; i < mp_irq_entries; i++) {
4691 +               int lbus = mp_irqs[i].mpc_srcbus;
4692 +
4693 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4694 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4695 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4696 +                    mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4697 +                   ) &&
4698 +                   (mp_irqs[i].mpc_irqtype == type) &&
4699 +                   (mp_irqs[i].mpc_srcbusirq == irq))
4700 +                       break;
4701 +       }
4702 +       if (i < mp_irq_entries) {
4703 +               int apic;
4704 +               for (apic = 0; apic < nr_ioapics; apic++) {
4705 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
4706 +                               return apic;
4707 +               }
4708 +       }
4709 +
4710 +       return -1;
4711 +}
4712 +
4713 +/*
4714 + * Find a specific PCI IRQ entry.
4715 + * Not an __init, possibly needed by modules
4716 + */
4717 +static int pin_2_irq(int idx, int apic, int pin);
4718 +
4719 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
4720 +{
4721 +       int apic, i, best_guess = -1;
4722 +
4723 +       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4724 +               "slot:%d, pin:%d.\n", bus, slot, pin);
4725 +       if (mp_bus_id_to_pci_bus[bus] == -1) {
4726 +               printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4727 +               return -1;
4728 +       }
4729 +       for (i = 0; i < mp_irq_entries; i++) {
4730 +               int lbus = mp_irqs[i].mpc_srcbus;
4731 +
4732 +               for (apic = 0; apic < nr_ioapics; apic++)
4733 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4734 +                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4735 +                               break;
4736 +
4737 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4738 +                   !mp_irqs[i].mpc_irqtype &&
4739 +                   (bus == lbus) &&
4740 +                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4741 +                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4742 +
4743 +                       if (!(apic || IO_APIC_IRQ(irq)))
4744 +                               continue;
4745 +
4746 +                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
4747 +                               return irq;
4748 +                       /*
4749 +                        * Use the first all-but-pin matching entry as a
4750 +                        * best-guess fuzzy result for broken mptables.
4751 +                        */
4752 +                       if (best_guess < 0)
4753 +                               best_guess = irq;
4754 +               }
4755 +       }
4756 +       return best_guess;
4757 +}
4758 +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
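
A typical caller is the PCI interrupt fixup path; the sketch below shows the calling convention, assuming a struct pci_dev *dev and its decoded interrupt pin (1 meaning INTA), in the style of arch/i386/pci/irq.c. The surrounding function is hypothetical:

	/* Sketch of the calling convention; struct pci_dev fields are
	 * the kernel's, the wrapper itself is invented. */
	static void example_route_pci_irq(struct pci_dev *dev, int pin)
	{
		int irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
						     PCI_SLOT(dev->devfn),
						     pin - 1);	/* pin 1 == INTA */

		if (irq >= 0)
			dev->irq = irq;
	}
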
4759 +
4760 +/*
4761 + * This function is currently only a helper for the i386 SMP boot process,
4762 + * where we need to reprogram the ioredtbls to cater for the CPUs which have
4763 + * come online, so the mask in all cases should simply be TARGET_CPUS.
4764 + */
4765 +#ifdef CONFIG_SMP
4766 +#ifndef CONFIG_XEN
4767 +void __init setup_ioapic_dest(void)
4768 +{
4769 +       int pin, ioapic, irq, irq_entry;
4770 +
4771 +       if (skip_ioapic_setup == 1)
4772 +               return;
4773 +
4774 +       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
4775 +               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4776 +                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4777 +                       if (irq_entry == -1)
4778 +                               continue;
4779 +                       irq = pin_2_irq(irq_entry, ioapic, pin);
4780 +                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
4781 +               }
4782 +
4783 +       }
4784 +}
4785 +#endif /* !CONFIG_XEN */
4786 +#endif
4787 +
4788 +/*
4789 + * EISA Edge/Level control register, ELCR
4790 + */
4791 +static int EISA_ELCR(unsigned int irq)
4792 +{
4793 +       if (irq < 16) {
4794 +               unsigned int port = 0x4d0 + (irq >> 3);
4795 +               return (inb(port) >> (irq & 7)) & 1;
4796 +       }
4797 +       apic_printk(APIC_VERBOSE, KERN_INFO
4798 +                       "Broken MPtable reports ISA irq %d\n", irq);
4799 +       return 0;
4800 +}
4801 +
4802 +/* EISA interrupts are always polarity zero and can be edge or level
4803 + * trigger depending on the ELCR value.  If an interrupt is listed as
4804 + * EISA conforming in the MP table, that means its trigger type must
4805 + * be read in from the ELCR */
4806 +
4807 +#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
4808 +#define default_EISA_polarity(idx)     (0)
4809 +
4810 +/* ISA interrupts are always polarity zero edge triggered,
4811 + * when listed as conforming in the MP table. */
4812 +
4813 +#define default_ISA_trigger(idx)       (0)
4814 +#define default_ISA_polarity(idx)      (0)
4815 +
4816 +/* PCI interrupts are always polarity one level triggered,
4817 + * when listed as conforming in the MP table. */
4818 +
4819 +#define default_PCI_trigger(idx)       (1)
4820 +#define default_PCI_polarity(idx)      (1)
4821 +
4822 +/* MCA interrupts are always polarity zero level triggered,
4823 + * when listed as conforming in the MP table. */
4824 +
4825 +#define default_MCA_trigger(idx)       (1)
4826 +#define default_MCA_polarity(idx)      (0)
4827 +
4828 +/* NEC98 interrupts are always polarity zero edge triggered,
4829 + * when listed as conforming in the MP table. */
4830 +
4831 +#define default_NEC98_trigger(idx)     (0)
4832 +#define default_NEC98_polarity(idx)    (0)
4833 +
4834 +static int __init MPBIOS_polarity(int idx)
4835 +{
4836 +       int bus = mp_irqs[idx].mpc_srcbus;
4837 +       int polarity;
4838 +
4839 +       /*
4840 +        * Determine IRQ line polarity (high active or low active):
4841 +        */
4842 +       switch (mp_irqs[idx].mpc_irqflag & 3)
4843 +       {
4844 +               case 0: /* conforms, i.e. bus-type dependent polarity */
4845 +               {
4846 +                       switch (mp_bus_id_to_type[bus])
4847 +                       {
4848 +                               case MP_BUS_ISA: /* ISA pin */
4849 +                               {
4850 +                                       polarity = default_ISA_polarity(idx);
4851 +                                       break;
4852 +                               }
4853 +                               case MP_BUS_EISA: /* EISA pin */
4854 +                               {
4855 +                                       polarity = default_EISA_polarity(idx);
4856 +                                       break;
4857 +                               }
4858 +                               case MP_BUS_PCI: /* PCI pin */
4859 +                               {
4860 +                                       polarity = default_PCI_polarity(idx);
4861 +                                       break;
4862 +                               }
4863 +                               case MP_BUS_MCA: /* MCA pin */
4864 +                               {
4865 +                                       polarity = default_MCA_polarity(idx);
4866 +                                       break;
4867 +                               }
4868 +                               case MP_BUS_NEC98: /* NEC 98 pin */
4869 +                               {
4870 +                                       polarity = default_NEC98_polarity(idx);
4871 +                                       break;
4872 +                               }
4873 +                               default:
4874 +                               {
4875 +                                       printk(KERN_WARNING "broken BIOS!!\n");
4876 +                                       polarity = 1;
4877 +                                       break;
4878 +                               }
4879 +                       }
4880 +                       break;
4881 +               }
4882 +               case 1: /* high active */
4883 +               {
4884 +                       polarity = 0;
4885 +                       break;
4886 +               }
4887 +               case 2: /* reserved */
4888 +               {
4889 +                       printk(KERN_WARNING "broken BIOS!!\n");
4890 +                       polarity = 1;
4891 +                       break;
4892 +               }
4893 +               case 3: /* low active */
4894 +               {
4895 +                       polarity = 1;
4896 +                       break;
4897 +               }
4898 +               default: /* invalid */
4899 +               {
4900 +                       printk(KERN_WARNING "broken BIOS!!\n");
4901 +                       polarity = 1;
4902 +                       break;
4903 +               }
4904 +       }
4905 +       return polarity;
4906 +}
4907 +
4908 +static int MPBIOS_trigger(int idx)
4909 +{
4910 +       int bus = mp_irqs[idx].mpc_srcbus;
4911 +       int trigger;
4912 +
4913 +       /*
4914 +        * Determine IRQ trigger mode (edge or level sensitive):
4915 +        */
4916 +       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
4917 +       {
4918 +               case 0: /* conforms, i.e. bus-type dependent */
4919 +               {
4920 +                       switch (mp_bus_id_to_type[bus])
4921 +                       {
4922 +                               case MP_BUS_ISA: /* ISA pin */
4923 +                               {
4924 +                                       trigger = default_ISA_trigger(idx);
4925 +                                       break;
4926 +                               }
4927 +                               case MP_BUS_EISA: /* EISA pin */
4928 +                               {
4929 +                                       trigger = default_EISA_trigger(idx);
4930 +                                       break;
4931 +                               }
4932 +                               case MP_BUS_PCI: /* PCI pin */
4933 +                               {
4934 +                                       trigger = default_PCI_trigger(idx);
4935 +                                       break;
4936 +                               }
4937 +                               case MP_BUS_MCA: /* MCA pin */
4938 +                               {
4939 +                                       trigger = default_MCA_trigger(idx);
4940 +                                       break;
4941 +                               }
4942 +                               case MP_BUS_NEC98: /* NEC 98 pin */
4943 +                               {
4944 +                                       trigger = default_NEC98_trigger(idx);
4945 +                                       break;
4946 +                               }
4947 +                               default:
4948 +                               {
4949 +                                       printk(KERN_WARNING "broken BIOS!!\n");
4950 +                                       trigger = 1;
4951 +                                       break;
4952 +                               }
4953 +                       }
4954 +                       break;
4955 +               }
4956 +               case 1: /* edge */
4957 +               {
4958 +                       trigger = 0;
4959 +                       break;
4960 +               }
4961 +               case 2: /* reserved */
4962 +               {
4963 +                       printk(KERN_WARNING "broken BIOS!!\n");
4964 +                       trigger = 1;
4965 +                       break;
4966 +               }
4967 +               case 3: /* level */
4968 +               {
4969 +                       trigger = 1;
4970 +                       break;
4971 +               }
4972 +               default: /* invalid */
4973 +               {
4974 +                       printk(KERN_WARNING "broken BIOS!!\n");
4975 +                       trigger = 0;
4976 +                       break;
4977 +               }
4978 +       }
4979 +       return trigger;
4980 +}
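
A worked decode of the two mpc_irqflag fields consumed by MPBIOS_polarity() and MPBIOS_trigger(), as a standalone sketch (the flag value 0x0D is invented): bits 0-1 select polarity, bits 2-3 select trigger, so 0x0D reads as active-high, level-triggered:

	/* Standalone decode of the mpc_irqflag fields; illustrative only. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int mpc_irqflag = 0x0D;		/* invented example */
		unsigned int pol  = mpc_irqflag & 3;		/* 1: high active */
		unsigned int trig = (mpc_irqflag >> 2) & 3;	/* 3: level       */

		/* The ternaries only cover this example; the full mapping
		 * is the pair of switches above. */
		printf("polarity field %u -> polarity = %u\n", pol, pol == 1 ? 0 : 1);
		printf("trigger  field %u -> trigger  = %u\n", trig, trig == 3 ? 1 : 0);
		return 0;
	}
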
4981 +
4982 +static inline int irq_polarity(int idx)
4983 +{
4984 +       return MPBIOS_polarity(idx);
4985 +}
4986 +
4987 +static inline int irq_trigger(int idx)
4988 +{
4989 +       return MPBIOS_trigger(idx);
4990 +}
4991 +
4992 +static int pin_2_irq(int idx, int apic, int pin)
4993 +{
4994 +       int irq, i;
4995 +       int bus = mp_irqs[idx].mpc_srcbus;
4996 +
4997 +       /*
4998 +        * Debugging check, we are in big trouble if this message pops up!
4999 +        */
5000 +       if (mp_irqs[idx].mpc_dstirq != pin)
5001 +               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
5002 +
5003 +       switch (mp_bus_id_to_type[bus])
5004 +       {
5005 +               case MP_BUS_ISA: /* ISA pin */
5006 +               case MP_BUS_EISA:
5007 +               case MP_BUS_MCA:
5008 +               case MP_BUS_NEC98:
5009 +               {
5010 +                       irq = mp_irqs[idx].mpc_srcbusirq;
5011 +                       break;
5012 +               }
5013 +               case MP_BUS_PCI: /* PCI pin */
5014 +               {
5015 +                       /*
5016 +                        * PCI IRQs are mapped in order
5017 +                        */
5018 +                       i = irq = 0;
5019 +                       while (i < apic)
5020 +                               irq += nr_ioapic_registers[i++];
5021 +                       irq += pin;
5022 +
5023 +                       /*
5024 +                        * For MPS mode, so far only needed by ES7000 platform
5025 +                        */
5026 +                       if (ioapic_renumber_irq)
5027 +                               irq = ioapic_renumber_irq(apic, irq);
5028 +
5029 +                       break;
5030 +               }
5031 +               default:
5032 +               {
5033 +                       printk(KERN_ERR "unknown bus type %d.\n", bus);
5034 +                       irq = 0;
5035 +                       break;
5036 +               }
5037 +       }
5038 +
5039 +       /*
5040 +        * PCI IRQ command line redirection. Yes, limits are hardcoded.
5041 +        */
5042 +       if ((pin >= 16) && (pin <= 23)) {
5043 +               if (pirq_entries[pin-16] != -1) {
5044 +                       if (!pirq_entries[pin-16]) {
5045 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5046 +                                               "disabling PIRQ%d\n", pin-16);
5047 +                       } else {
5048 +                               irq = pirq_entries[pin-16];
5049 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5050 +                                               "using PIRQ%d -> IRQ %d\n",
5051 +                                               pin-16, irq);
5052 +                       }
5053 +               }
5054 +       }
5055 +       return irq;
5056 +}
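
In the PCI branch the IRQ number is just the global pin index across all I/O APICs. A standalone sketch with invented sizes (two I/O APICs of 24 pins each) shows pin 5 on apic 1 landing on irq 24 + 5 = 29:

	/* Sketch of the PCI pin -> irq accumulation; sizes are made up. */
	#include <stdio.h>

	int main(void)
	{
		int nr_ioapic_registers[] = { 24, 24 };	/* invented sizes */
		int apic = 1, pin = 5;
		int i = 0, irq = 0;

		while (i < apic)
			irq += nr_ioapic_registers[i++];
		irq += pin;
		printf("apic %d pin %d -> irq %d\n", apic, pin, irq);	/* 29 */
		return 0;
	}
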
5057 +
5058 +static inline int IO_APIC_irq_trigger(int irq)
5059 +{
5060 +       int apic, idx, pin;
5061 +
5062 +       for (apic = 0; apic < nr_ioapics; apic++) {
5063 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5064 +                       idx = find_irq_entry(apic,pin,mp_INT);
5065 +                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
5066 +                               return irq_trigger(idx);
5067 +               }
5068 +       }
5069 +       /*
5070 +        * nonexistent IRQs default to edge-triggered
5071 +        */
5072 +       return 0;
5073 +}
5074 +
5075 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
5076 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
5077 +
5078 +int assign_irq_vector(int irq)
5079 +{
5080 +       static int current_vector = FIRST_DEVICE_VECTOR;
5081 +       physdev_op_t op;
5082 +
5083 +       BUG_ON(irq >= NR_IRQ_VECTORS);
5084 +       if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
5085 +               return IO_APIC_VECTOR(irq);
5086 +
5087 +       op.cmd = PHYSDEVOP_ASSIGN_VECTOR;
5088 +       op.u.irq_op.irq = irq;
5089 +       if (HYPERVISOR_physdev_op(&op))
5090 +               return -ENOSPC;
5091 +       current_vector = op.u.irq_op.vector;
5092 +
5093 +       vector_irq[current_vector] = irq;
5094 +       if (irq != AUTO_ASSIGN)
5095 +               IO_APIC_VECTOR(irq) = current_vector;
5096 +
5097 +       return current_vector;
5098 +}
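
Under Xen the vector choice is delegated to the hypervisor via PHYSDEVOP_ASSIGN_VECTOR; the kernel only records what it is handed back in op.u.irq_op.vector. For contrast, a from-memory, simplified sketch of the native i386 policy this replaces, which stepped a static counter by 8 to spread consecutive IRQs across priority levels and skipped the syscall gate (the vector constants are stand-ins for the kernel's):

	/* From-memory, simplified sketch of the native stepping policy;
	 * not the exact upstream code. */
	#include <stdio.h>

	#define FIRST_DEVICE_VECTOR	0x31
	#define FIRST_SYSTEM_VECTOR	0xef
	#define SYSCALL_VECTOR		0x80

	static int current_vector = FIRST_DEVICE_VECTOR;
	static int offset;

	static int native_next_vector(void)
	{
	next:
		current_vector += 8;
		if (current_vector == SYSCALL_VECTOR)
			goto next;			/* never hand out int 0x80 */
		if (current_vector >= FIRST_SYSTEM_VECTOR) {
			offset = (offset + 1) % 8;	/* wrap to the next column */
			current_vector = FIRST_DEVICE_VECTOR + offset;
		}
		return current_vector;
	}

	int main(void)
	{
		int i;

		for (i = 0; i < 4; i++)
			printf("0x%02x ", native_next_vector());
		printf("\n");				/* 0x39 0x41 0x49 0x51 */
		return 0;
	}
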
5099 +
5100 +#ifndef CONFIG_XEN
5101 +static struct hw_interrupt_type ioapic_level_type;
5102 +static struct hw_interrupt_type ioapic_edge_type;
5103 +
5104 +#define IOAPIC_AUTO    -1
5105 +#define IOAPIC_EDGE    0
5106 +#define IOAPIC_LEVEL   1
5107 +
5108 +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
5109 +{
5110 +       if (use_pci_vector() && !platform_legacy_irq(irq)) {
5111 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5112 +                               trigger == IOAPIC_LEVEL)
5113 +                       irq_desc[vector].handler = &ioapic_level_type;
5114 +               else
5115 +                       irq_desc[vector].handler = &ioapic_edge_type;
5116 +               set_intr_gate(vector, interrupt[vector]);
5117 +       } else  {
5118 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5119 +                               trigger == IOAPIC_LEVEL)
5120 +                       irq_desc[irq].handler = &ioapic_level_type;
5121 +               else
5122 +                       irq_desc[irq].handler = &ioapic_edge_type;
5123 +               set_intr_gate(vector, interrupt[irq]);
5124 +       }
5125 +}
5126 +#else
5127 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
5128 +#endif
5129 +
5130 +static void __init setup_IO_APIC_irqs(void)
5131 +{
5132 +       struct IO_APIC_route_entry entry;
5133 +       int apic, pin, idx, irq, first_notcon = 1, vector;
5134 +       unsigned long flags;
5135 +
5136 +       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
5137 +
5138 +       for (apic = 0; apic < nr_ioapics; apic++) {
5139 +       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5140 +
5141 +               /*
5142 +                * add it to the IO-APIC irq-routing table:
5143 +                */
5144 +               memset(&entry,0,sizeof(entry));
5145 +
5146 +               entry.delivery_mode = INT_DELIVERY_MODE;
5147 +               entry.dest_mode = INT_DEST_MODE;
5148 +               entry.mask = 0;                         /* enable IRQ */
5149 +               entry.dest.logical.logical_dest = 
5150 +                                       cpu_mask_to_apicid(TARGET_CPUS);
5151 +
5152 +               idx = find_irq_entry(apic,pin,mp_INT);
5153 +               if (idx == -1) {
5154 +                       if (first_notcon) {
5155 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5156 +                                               " IO-APIC (apicid-pin) %d-%d",
5157 +                                               mp_ioapics[apic].mpc_apicid,
5158 +                                               pin);
5159 +                               first_notcon = 0;
5160 +                       } else
5161 +                               apic_printk(APIC_VERBOSE, ", %d-%d",
5162 +                                       mp_ioapics[apic].mpc_apicid, pin);
5163 +                       continue;
5164 +               }
5165 +
5166 +               entry.trigger = irq_trigger(idx);
5167 +               entry.polarity = irq_polarity(idx);
5168 +
5169 +               if (irq_trigger(idx)) {
5170 +                       entry.trigger = 1;
5171 +                       entry.mask = 1;
5172 +               }
5173 +
5174 +               irq = pin_2_irq(idx, apic, pin);
5175 +               /*
5176 +                * skip adding the timer int on secondary nodes, which causes
5177 +                * a small but painful rift in the time-space continuum
5178 +                */
5179 +               if (multi_timer_check(apic, irq))
5180 +                       continue;
5181 +               else
5182 +                       add_pin_to_irq(irq, apic, pin);
5183 +
5184 +               if (/*!apic &&*/ !IO_APIC_IRQ(irq))
5185 +                       continue;
5186 +
5187 +               if (IO_APIC_IRQ(irq)) {
5188 +                       vector = assign_irq_vector(irq);
5189 +                       entry.vector = vector;
5190 +                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
5191 +               
5192 +                       if (!apic && (irq < 16))
5193 +                               disable_8259A_irq(irq);
5194 +               }
5195 +               spin_lock_irqsave(&ioapic_lock, flags);
5196 +               io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5197 +               io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5198 +               set_native_irq_info(irq, TARGET_CPUS);
5199 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5200 +       }
5201 +       }
5202 +
5203 +       if (!first_notcon)
5204 +               apic_printk(APIC_VERBOSE, " not connected.\n");
5205 +}
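
Note the write order above: the high dword at index 0x11 goes out before the low dword at 0x10. The mask bit lives in the low dword, so the destination field is already in place by the time the entry can fire. An abbreviated sketch of the entry layout, field names approximated from the 82093AA datasheet (bitfield ordering is compiler-defined, so treat this as a diagram):

	/* Abbreviated route-entry layout sketch; illustrative only. */
	#include <stdio.h>

	union route_entry {
		struct {
			unsigned int vector : 8, delivery_mode : 3, dest_mode : 1,
				delivery_status : 1, polarity : 1, irr : 1,
				trigger : 1, mask : 1, reserved : 15;	/* reg 0x10 */
			unsigned int reserved2 : 24, dest : 8;		/* reg 0x11 */
		} bits;
		unsigned int raw[2];
	};

	int main(void)
	{
		union route_entry e = { .bits = { .vector = 0x31, .mask = 1 } };

		printf("low dword %08x, high dword %08x\n", e.raw[0], e.raw[1]);
		return 0;
	}
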
5206 +
5207 +/*
5208 + * Set up the 8259A-master output pin:
5209 + */
5210 +#ifndef CONFIG_XEN
5211 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
5212 +{
5213 +       struct IO_APIC_route_entry entry;
5214 +       unsigned long flags;
5215 +
5216 +       memset(&entry,0,sizeof(entry));
5217 +
5218 +       disable_8259A_irq(0);
5219 +
5220 +       /* mask LVT0 */
5221 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5222 +
5223 +       /*
5224 +        * We use logical delivery to get the timer IRQ
5225 +        * to the first CPU.
5226 +        */
5227 +       entry.dest_mode = INT_DEST_MODE;
5228 +       entry.mask = 0;                                 /* unmask IRQ now */
5229 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5230 +       entry.delivery_mode = INT_DELIVERY_MODE;
5231 +       entry.polarity = 0;
5232 +       entry.trigger = 0;
5233 +       entry.vector = vector;
5234 +
5235 +       /*
5236 +        * The timer IRQ doesn't have to know that behind the
5237 +        * scenes we have an 8259A-master in AEOI mode ...
5238 +        */
5239 +       irq_desc[0].handler = &ioapic_edge_type;
5240 +
5241 +       /*
5242 +        * Add it to the IO-APIC irq-routing table:
5243 +        */
5244 +       spin_lock_irqsave(&ioapic_lock, flags);
5245 +       io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5246 +       io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5247 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5248 +
5249 +       enable_8259A_irq(0);
5250 +}
5251 +
5252 +static inline void UNEXPECTED_IO_APIC(void)
5253 +{
5254 +}
5255 +
5256 +void __init print_IO_APIC(void)
5257 +{
5258 +       int apic, i;
5259 +       union IO_APIC_reg_00 reg_00;
5260 +       union IO_APIC_reg_01 reg_01;
5261 +       union IO_APIC_reg_02 reg_02;
5262 +       union IO_APIC_reg_03 reg_03;
5263 +       unsigned long flags;
5264 +
5265 +       if (apic_verbosity == APIC_QUIET)
5266 +               return;
5267 +
5268 +       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
5269 +       for (i = 0; i < nr_ioapics; i++)
5270 +               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
5271 +                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
5272 +
5273 +       /*
5274 +        * We are a bit conservative about what we expect.  We have to
5275 +        * know about every hardware change ASAP.
5276 +        */
5277 +       printk(KERN_INFO "testing the IO APIC.......................\n");
5278 +
5279 +       for (apic = 0; apic < nr_ioapics; apic++) {
5280 +
5281 +       spin_lock_irqsave(&ioapic_lock, flags);
5282 +       reg_00.raw = io_apic_read(apic, 0);
5283 +       reg_01.raw = io_apic_read(apic, 1);
5284 +       if (reg_01.bits.version >= 0x10)
5285 +               reg_02.raw = io_apic_read(apic, 2);
5286 +       if (reg_01.bits.version >= 0x20)
5287 +               reg_03.raw = io_apic_read(apic, 3);
5288 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5289 +
5290 +       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
5291 +       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
5292 +       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
5293 +       printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
5294 +       printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
5295 +       if (reg_00.bits.ID >= get_physical_broadcast())
5296 +               UNEXPECTED_IO_APIC();
5297 +       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
5298 +               UNEXPECTED_IO_APIC();
5299 +
5300 +       printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
5301 +       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
5302 +       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
5303 +               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
5304 +               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
5305 +               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
5306 +               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
5307 +               (reg_01.bits.entries != 0x2E) &&
5308 +               (reg_01.bits.entries != 0x3F)
5309 +       )
5310 +               UNEXPECTED_IO_APIC();
5311 +
5312 +       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
5313 +       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
5314 +       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
5315 +               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
5316 +               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
5317 +               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
5318 +               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
5319 +       )
5320 +               UNEXPECTED_IO_APIC();
5321 +       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
5322 +               UNEXPECTED_IO_APIC();
5323 +
5324 +       /*
5325 +        * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
5326 +        * but the value of reg_02 is read as the previous read register
5327 +        * value, so ignore it if reg_02 == reg_01.
5328 +        */
5329 +       if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
5330 +               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
5331 +               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
5332 +               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
5333 +                       UNEXPECTED_IO_APIC();
5334 +       }
5335 +
5336 +       /*
5337 +        * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
5338 +        * or reg_03, but the value of reg_0[23] is read as the previous read
5339 +        * register value, so ignore it if reg_03 == reg_0[12].
5340 +        */
5341 +       if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
5342 +           reg_03.raw != reg_01.raw) {
5343 +               printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
5344 +               printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
5345 +               if (reg_03.bits.__reserved_1)
5346 +                       UNEXPECTED_IO_APIC();
5347 +       }
5348 +
5349 +       printk(KERN_DEBUG ".... IRQ redirection table:\n");
5350 +
5351 +       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
5352 +                         " Stat Dest Deli Vect:   \n");
5353 +
5354 +       for (i = 0; i <= reg_01.bits.entries; i++) {
5355 +               struct IO_APIC_route_entry entry;
5356 +
5357 +               spin_lock_irqsave(&ioapic_lock, flags);
5358 +               *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
5359 +               *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
5360 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5361 +
5362 +               printk(KERN_DEBUG " %02x %03X %02X  ",
5363 +                       i,
5364 +                       entry.dest.logical.logical_dest,
5365 +                       entry.dest.physical.physical_dest
5366 +               );
5367 +
5368 +               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
5369 +                       entry.mask,
5370 +                       entry.trigger,
5371 +                       entry.irr,
5372 +                       entry.polarity,
5373 +                       entry.delivery_status,
5374 +                       entry.dest_mode,
5375 +                       entry.delivery_mode,
5376 +                       entry.vector
5377 +               );
5378 +       }
5379 +       }
5380 +       if (use_pci_vector())
5381 +               printk(KERN_INFO "Using vector-based indexing\n");
5382 +       printk(KERN_DEBUG "IRQ to pin mappings:\n");
5383 +       for (i = 0; i < NR_IRQS; i++) {
5384 +               struct irq_pin_list *entry = irq_2_pin + i;
5385 +               if (entry->pin < 0)
5386 +                       continue;
5387 +               if (use_pci_vector() && !platform_legacy_irq(i))
5388 +                       printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
5389 +               else
5390 +                       printk(KERN_DEBUG "IRQ%d ", i);
5391 +               for (;;) {
5392 +                       printk("-> %d:%d", entry->apic, entry->pin);
5393 +                       if (!entry->next)
5394 +                               break;
5395 +                       entry = irq_2_pin + entry->next;
5396 +               }
5397 +               printk("\n");
5398 +       }
5399 +
5400 +       printk(KERN_INFO ".................................... done.\n");
5401 +
5402 +       return;
5403 +}
5404 +
5405 +#if 0
5406 +
5407 +static void print_APIC_bitfield (int base)
5408 +{
5409 +       unsigned int v;
5410 +       int i, j;
5411 +
5412 +       if (apic_verbosity == APIC_QUIET)
5413 +               return;
5414 +
5415 +       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
5416 +       for (i = 0; i < 8; i++) {
5417 +               v = apic_read(base + i*0x10);
5418 +               for (j = 0; j < 32; j++) {
5419 +                       if (v & (1<<j))
5420 +                               printk("1");
5421 +                       else
5422 +                               printk("0");
5423 +               }
5424 +               printk("\n");
5425 +       }
5426 +}
5427 +
5428 +void /*__init*/ print_local_APIC(void * dummy)
5429 +{
5430 +       unsigned int v, ver, maxlvt;
5431 +
5432 +       if (apic_verbosity == APIC_QUIET)
5433 +               return;
5434 +
5435 +       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
5436 +               smp_processor_id(), hard_smp_processor_id());
5437 +       v = apic_read(APIC_ID);
5438 +       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
5439 +       v = apic_read(APIC_LVR);
5440 +       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
5441 +       ver = GET_APIC_VERSION(v);
5442 +       maxlvt = get_maxlvt();
5443 +
5444 +       v = apic_read(APIC_TASKPRI);
5445 +       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
5446 +
5447 +       if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
5448 +               v = apic_read(APIC_ARBPRI);
5449 +               printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
5450 +                       v & APIC_ARBPRI_MASK);
5451 +               v = apic_read(APIC_PROCPRI);
5452 +               printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
5453 +       }
5454 +
5455 +       v = apic_read(APIC_EOI);
5456 +       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
5457 +       v = apic_read(APIC_RRR);
5458 +       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
5459 +       v = apic_read(APIC_LDR);
5460 +       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
5461 +       v = apic_read(APIC_DFR);
5462 +       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
5463 +       v = apic_read(APIC_SPIV);
5464 +       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
5465 +
5466 +       printk(KERN_DEBUG "... APIC ISR field:\n");
5467 +       print_APIC_bitfield(APIC_ISR);
5468 +       printk(KERN_DEBUG "... APIC TMR field:\n");
5469 +       print_APIC_bitfield(APIC_TMR);
5470 +       printk(KERN_DEBUG "... APIC IRR field:\n");
5471 +       print_APIC_bitfield(APIC_IRR);
5472 +
5473 +       if (APIC_INTEGRATED(ver)) {             /* !82489DX */
5474 +               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
5475 +                       apic_write(APIC_ESR, 0);
5476 +               v = apic_read(APIC_ESR);
5477 +               printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
5478 +       }
5479 +
5480 +       v = apic_read(APIC_ICR);
5481 +       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
5482 +       v = apic_read(APIC_ICR2);
5483 +       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
5484 +
5485 +       v = apic_read(APIC_LVTT);
5486 +       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
5487 +
5488 +       if (maxlvt > 3) {                       /* PC is LVT#4. */
5489 +               v = apic_read(APIC_LVTPC);
5490 +               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
5491 +       }
5492 +       v = apic_read(APIC_LVT0);
5493 +       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
5494 +       v = apic_read(APIC_LVT1);
5495 +       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
5496 +
5497 +       if (maxlvt > 2) {                       /* ERR is LVT#3. */
5498 +               v = apic_read(APIC_LVTERR);
5499 +               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
5500 +       }
5501 +
5502 +       v = apic_read(APIC_TMICT);
5503 +       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
5504 +       v = apic_read(APIC_TMCCT);
5505 +       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
5506 +       v = apic_read(APIC_TDCR);
5507 +       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
5508 +       printk("\n");
5509 +}
5510 +
5511 +void print_all_local_APICs (void)
5512 +{
5513 +       on_each_cpu(print_local_APIC, NULL, 1, 1);
5514 +}
5515 +
5516 +void /*__init*/ print_PIC(void)
5517 +{
5518 +       unsigned int v;
5519 +       unsigned long flags;
5520 +
5521 +       if (apic_verbosity == APIC_QUIET)
5522 +               return;
5523 +
5524 +       printk(KERN_DEBUG "\nprinting PIC contents\n");
5525 +
5526 +       spin_lock_irqsave(&i8259A_lock, flags);
5527 +
5528 +       v = inb(0xa1) << 8 | inb(0x21);
5529 +       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
5530 +
5531 +       v = inb(0xa0) << 8 | inb(0x20);
5532 +       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
5533 +
5534 +       outb(0x0b,0xa0);
5535 +       outb(0x0b,0x20);
5536 +       v = inb(0xa0) << 8 | inb(0x20);
5537 +       outb(0x0a,0xa0);
5538 +       outb(0x0a,0x20);
5539 +
5540 +       spin_unlock_irqrestore(&i8259A_lock, flags);
5541 +
5542 +       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
5543 +
5544 +       v = inb(0x4d1) << 8 | inb(0x4d0);
5545 +       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
5546 +}
5547 +
5548 +#endif  /*  0  */
5549 +
5550 +#else
5551 +void __init print_IO_APIC(void) { }
5552 +#endif /* !CONFIG_XEN */
5553 +
5554 +static void __init enable_IO_APIC(void)
5555 +{
5556 +       union IO_APIC_reg_01 reg_01;
5557 +       int i8259_apic, i8259_pin;
5558 +       int i, apic;
5559 +       unsigned long flags;
5560 +
5561 +       for (i = 0; i < PIN_MAP_SIZE; i++) {
5562 +               irq_2_pin[i].pin = -1;
5563 +               irq_2_pin[i].next = 0;
5564 +       }
5565 +       if (!pirqs_enabled)
5566 +               for (i = 0; i < MAX_PIRQS; i++)
5567 +                       pirq_entries[i] = -1;
5568 +
5569 +       /*
5570 +        * The number of IO-APIC IRQ registers (== #pins):
5571 +        */
5572 +       for (apic = 0; apic < nr_ioapics; apic++) {
5573 +               spin_lock_irqsave(&ioapic_lock, flags);
5574 +               reg_01.raw = io_apic_read(apic, 1);
5575 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5576 +               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
5577 +       }
5578 +       for (apic = 0; apic < nr_ioapics; apic++) {
5579 +               int pin;
5580 +               /* See if any of the pins is in ExtINT mode */
5581 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5582 +                       struct IO_APIC_route_entry entry;
5583 +                       spin_lock_irqsave(&ioapic_lock, flags);
5584 +                       *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5585 +                       *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5586 +                       spin_unlock_irqrestore(&ioapic_lock, flags);
5587 +
5588 +
5589 +                       /* If the interrupt line is enabled and in ExtInt mode,
5590 +                        * we have found the pin where the i8259 is connected.
5591 +                        */
5592 +                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
5593 +                               ioapic_i8259.apic = apic;
5594 +                               ioapic_i8259.pin  = pin;
5595 +                               goto found_i8259;
5596 +                       }
5597 +               }
5598 +       }
5599 + found_i8259:
5600 +       /* Look to see if the MP table has reported the ExtINT */
5601 +       /* If we could not find the appropriate pin by looking at the ioapic,
5602 +        * the i8259 is probably not connected to the ioapic, but give the
5603 +        * mptable a chance anyway.
5604 +        */
5605 +       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
5606 +       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
5607 +       /* Trust the MP table if nothing is set up in the hardware */
5608 +       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
5609 +               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
5610 +               ioapic_i8259.pin  = i8259_pin;
5611 +               ioapic_i8259.apic = i8259_apic;
5612 +       }
5613 +       /* Complain if the MP table and the hardware disagree */
5614 +       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
5615 +               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
5616 +       {
5617 +               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
5618 +       }
5619 +
5620 +       /*
5621 +        * Do not trust the IO-APIC being empty at bootup
5622 +        */
5623 +       clear_IO_APIC();
5624 +}
5625 +
5626 +/*
5627 + * Not an __init, needed by the reboot code
5628 + */
5629 +void disable_IO_APIC(void)
5630 +{
5631 +       /*
5632 +        * Clear the IO-APIC before rebooting:
5633 +        */
5634 +       clear_IO_APIC();
5635 +
5636 +#ifndef CONFIG_XEN
5637 +       /*
5638 +        * If the i8259 is routed through an IOAPIC
5639 +        * Put that IOAPIC in virtual wire mode
5640 +        * so legacy interrupts can be delivered.
5641 +        */
5642 +       if (ioapic_i8259.pin != -1) {
5643 +               struct IO_APIC_route_entry entry;
5644 +               unsigned long flags;
5645 +
5646 +               memset(&entry, 0, sizeof(entry));
5647 +               entry.mask            = 0; /* Enabled */
5648 +               entry.trigger         = 0; /* Edge */
5649 +               entry.irr             = 0;
5650 +               entry.polarity        = 0; /* High */
5651 +               entry.delivery_status = 0;
5652 +               entry.dest_mode       = 0; /* Physical */
5653 +               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
5654 +               entry.vector          = 0;
5655 +               entry.dest.physical.physical_dest =
5656 +                                       GET_APIC_ID(apic_read(APIC_ID));
5657 +
5658 +               /*
5659 +                * Add it to the IO-APIC irq-routing table:
5660 +                */
5661 +               spin_lock_irqsave(&ioapic_lock, flags);
5662 +               io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
5663 +                       *(((int *)&entry)+1));
5664 +               io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
5665 +                       *(((int *)&entry)+0));
5666 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5667 +       }
5668 +       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
5669 +#endif
5670 +}
5671 +
5672 +/*
5673 + * function to set the IO-APIC physical IDs based on the
5674 + * values stored in the MPC table.
5675 + *
5676 + * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
5677 + */
5678 +
5679 +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
5680 +static void __init setup_ioapic_ids_from_mpc(void)
5681 +{
5682 +       union IO_APIC_reg_00 reg_00;
5683 +       physid_mask_t phys_id_present_map;
5684 +       int apic;
5685 +       int i;
5686 +       unsigned char old_id;
5687 +       unsigned long flags;
5688 +
5689 +       /*
5690 +        * Don't check I/O APIC IDs for xAPIC systems.  They have
5691 +        * no meaning without the serial APIC bus.
5692 +        */
5693 +       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 < 15))
5694 +               return;
5695 +       /*
5696 +        * This is broken; anything with a real cpu count has to
5697 +        * circumvent this idiocy regardless.
5698 +        */
5699 +       phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
5700 +
5701 +       /*
5702 +        * Set the IOAPIC ID to the value stored in the MPC table.
5703 +        */
5704 +       for (apic = 0; apic < nr_ioapics; apic++) {
5705 +
5706 +               /* Read the register 0 value */
5707 +               spin_lock_irqsave(&ioapic_lock, flags);
5708 +               reg_00.raw = io_apic_read(apic, 0);
5709 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5710 +               
5711 +               old_id = mp_ioapics[apic].mpc_apicid;
5712 +
5713 +               if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
5714 +                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
5715 +                               apic, mp_ioapics[apic].mpc_apicid);
5716 +                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5717 +                               reg_00.bits.ID);
5718 +                       mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
5719 +               }
5720 +
5721 +               /*
5722 +                * Sanity check, is the ID really free? Every APIC in a
5723 +                * system must have a unique ID or we get lots of nice
5724 +                * 'stuck on smp_invalidate_needed IPI wait' messages.
5725 +                */
5726 +               if (check_apicid_used(phys_id_present_map,
5727 +                                       mp_ioapics[apic].mpc_apicid)) {
5728 +                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5729 +                               apic, mp_ioapics[apic].mpc_apicid);
5730 +                       for (i = 0; i < get_physical_broadcast(); i++)
5731 +                               if (!physid_isset(i, phys_id_present_map))
5732 +                                       break;
5733 +                       if (i >= get_physical_broadcast())
5734 +                               panic("Max APIC ID exceeded!\n");
5735 +                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5736 +                               i);
5737 +                       physid_set(i, phys_id_present_map);
5738 +                       mp_ioapics[apic].mpc_apicid = i;
5739 +               } else {
5740 +                       physid_mask_t tmp;
5741 +                       tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5742 +                       apic_printk(APIC_VERBOSE, "Setting %d in the "
5743 +                                       "phys_id_present_map\n",
5744 +                                       mp_ioapics[apic].mpc_apicid);
5745 +                       physids_or(phys_id_present_map, phys_id_present_map, tmp);
5746 +               }
5747 +
5748 +
5749 +               /*
5750 +                * We need to adjust the IRQ routing table
5751 +                * if the ID changed.
5752 +                */
5753 +               if (old_id != mp_ioapics[apic].mpc_apicid)
5754 +                       for (i = 0; i < mp_irq_entries; i++)
5755 +                               if (mp_irqs[i].mpc_dstapic == old_id)
5756 +                                       mp_irqs[i].mpc_dstapic
5757 +                                               = mp_ioapics[apic].mpc_apicid;
5758 +
5759 +               /*
5760 +                * Read the right value from the MPC table and
5761 +                * write it into the ID register.
5762 +                */
5763 +               apic_printk(APIC_VERBOSE, KERN_INFO
5764 +                       "...changing IO-APIC physical APIC ID to %d ...",
5765 +                       mp_ioapics[apic].mpc_apicid);
5766 +
5767 +               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
5768 +               spin_lock_irqsave(&ioapic_lock, flags);
5769 +               io_apic_write(apic, 0, reg_00.raw);
5770 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5771 +
5772 +               /*
5773 +                * Sanity check
5774 +                */
5775 +               spin_lock_irqsave(&ioapic_lock, flags);
5776 +               reg_00.raw = io_apic_read(apic, 0);
5777 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5778 +               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
5779 +                       printk("could not set ID!\n");
5780 +               else
5781 +                       apic_printk(APIC_VERBOSE, " ok.\n");
5782 +       }
5783 +}
5784 +#else
5785 +static void __init setup_ioapic_ids_from_mpc(void) { }
5786 +#endif
5787 +
5788 +#ifndef CONFIG_XEN
5789 +/*
5790 + * There is a nasty bug in some older SMP boards, their mptable lies
5791 + * about the timer IRQ. We do the following to work around the situation:
5792 + *
5793 + *     - timer IRQ defaults to IO-APIC IRQ
5794 + *     - if this function detects that timer IRQs are defunct, then we fall
5795 + *       back to ISA timer IRQs
5796 + */
5797 +static int __init timer_irq_works(void)
5798 +{
5799 +       unsigned long t1 = jiffies;
5800 +
5801 +       local_irq_enable();
5802 +       /* Let ten ticks pass... */
5803 +       mdelay((10 * 1000) / HZ);
5804 +
5805 +       /*
5806 +        * Expect a few ticks at least, to be sure some possible
5807 +        * glue logic does not lock up after one or two first
5808 +        * ticks in a non-ExtINT mode.  Also the local APIC
5809 +        * might have cached one ExtINT interrupt.  Finally, at
5810 +        * least one tick may be lost due to delays.
5811 +        */
5812 +       if (jiffies - t1 > 4)
5813 +               return 1;
5814 +
5815 +       return 0;
5816 +}
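
The delay arithmetic converts ten ticks into milliseconds; with an assumed HZ of 250 that is 40 ms, and a working timer advances jiffies by about 10, comfortably above the threshold of 4. In standalone form:

	/* Worked numbers for the ten-tick probe; HZ=250 is an assumption. */
	#include <stdio.h>

	int main(void)
	{
		const unsigned int hz = 250;		/* assumed CONFIG_HZ */
		unsigned long t1 = 1000, jiffies = 1000;

		printf("busy-wait: %u ms\n", (10 * 1000) / hz);	/* 40 ms */
		jiffies += 10;					/* healthy outcome */
		printf("timer works: %d\n", jiffies - t1 > 4 ? 1 : 0);
		return 0;
	}
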
5817 +
5818 +/*
5819 + * In the SMP+IOAPIC case it might happen that there are an unspecified
5820 + * number of pending IRQ events unhandled. These cases are very rare,
5821 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
5822 + * better to do it this way as thus we do not have to be aware of
5823 + * 'pending' interrupts in the IRQ path, except at this point.
5824 + */
5825 +/*
5826 + * Edge-triggered chips need to resend any interrupt
5827 + * that was delayed, but this is now handled in the
5828 + * device-independent code.
5829 + */
5830 +
5831 +/*
5832 + * Starting up an edge-triggered IO-APIC interrupt is
5833 + * nasty - we need to make sure that we get the edge.
5834 + * If it is already asserted for some reason, we need
5835 + * to return 1 to indicate that it was pending.
5836 + *
5837 + * This is not complete - we should be able to fake
5838 + * an edge even if it isn't on the 8259A...
5839 + */
5840 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
5841 +{
5842 +       int was_pending = 0;
5843 +       unsigned long flags;
5844 +
5845 +       spin_lock_irqsave(&ioapic_lock, flags);
5846 +       if (irq < 16) {
5847 +               disable_8259A_irq(irq);
5848 +               if (i8259A_irq_pending(irq))
5849 +                       was_pending = 1;
5850 +       }
5851 +       __unmask_IO_APIC_irq(irq);
5852 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5853 +
5854 +       return was_pending;
5855 +}
5856 +
5857 +/*
5858 + * Once we have recorded IRQ_PENDING already, we can mask the
5859 + * interrupt for real. This prevents IRQ storms from unhandled
5860 + * devices.
5861 + */
5862 +static void ack_edge_ioapic_irq(unsigned int irq)
5863 +{
5864 +       move_irq(irq);
5865 +       if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
5866 +                                       == (IRQ_PENDING | IRQ_DISABLED))
5867 +               mask_IO_APIC_irq(irq);
5868 +       ack_APIC_irq();
5869 +}
5870 +
5871 +/*
5872 + * Level triggered interrupts can just be masked,
5873 + * and shutting down and starting up the interrupt
5874 + * is the same as enabling and disabling them -- except
5875 + * that startup needs to return a "was pending" value.
5876 + *
5877 + * Level triggered interrupts are special because we
5878 + * do not touch any IO-APIC register while handling
5879 + * them. We ack the APIC in the end-IRQ handler, not
5880 + * in the start-IRQ-handler. Protection against reentrance
5881 + * from the same interrupt is still provided, both by the
5882 + * generic IRQ layer and by the fact that an unacked local
5883 + * APIC does not accept IRQs.
5884 + */
5885 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
5886 +{
5887 +       unmask_IO_APIC_irq(irq);
5888 +
5889 +       return 0; /* don't check for pending */
5890 +}
5891 +
5892 +static void end_level_ioapic_irq (unsigned int irq)
5893 +{
5894 +       unsigned long v;
5895 +       int i;
5896 +
5897 +       move_irq(irq);
5898 +/*
5899 + * It appears there is an erratum which affects at least version 0x11
5900 + * of I/O APIC (that's the 82093AA and cores integrated into various
5901 + * chipsets).  Under certain conditions a level-triggered interrupt is
5902 + * erroneously delivered as edge-triggered one but the respective IRR
5903 + * bit gets set nevertheless.  As a result the I/O unit expects an EOI
5904 + * message but it will never arrive and further interrupts are blocked
5905 + * from the source.  The exact reason is so far unknown, but the
5906 + * phenomenon was observed when two consecutive interrupt requests
5907 + * from a given source get delivered to the same CPU and the source is
5908 + * temporarily disabled in between.
5909 + *
5910 + * A workaround is to simulate an EOI message manually.  We achieve it
5911 + * by setting the trigger mode to edge and then to level when the edge
5912 + * trigger mode gets detected in the TMR of a local APIC for a
5913 + * level-triggered interrupt.  We mask the source for the time of the
5914 + * operation to prevent an edge-triggered interrupt escaping meanwhile.
5915 + * The idea is from Manfred Spraul.  --macro
5916 + */
5917 +       i = IO_APIC_VECTOR(irq);
5918 +
5919 +       v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
5920 +
5921 +       ack_APIC_irq();
5922 +
5923 +       if (!(v & (1 << (i & 0x1f)))) {
5924 +               atomic_inc(&irq_mis_count);
5925 +               spin_lock(&ioapic_lock);
5926 +               __mask_and_edge_IO_APIC_irq(irq);
5927 +               __unmask_and_level_IO_APIC_irq(irq);
5928 +               spin_unlock(&ioapic_lock);
5929 +       }
5930 +}
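
The TMR lookup above packs 32 vectors into each local-APIC register, with registers spaced 0x10 apart, which is exactly what (i & ~0x1f) >> 1 computes. A standalone check with an invented vector of 0x51:

	/* Illustrative only: locating vector i in the local APIC TMR array. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int i = 0x51;			/* invented vector */
		unsigned int reg = (i & ~0x1f) >> 1;	/* 0x10 per 32 vectors */
		unsigned int bit = i & 0x1f;

		printf("vector 0x%02x -> APIC_TMR + 0x%02x, bit %u\n", i, reg, bit);
		return 0;	/* prints APIC_TMR + 0x20, bit 17 */
	}
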
5931 +
5932 +#ifdef CONFIG_PCI_MSI
5933 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
5934 +{
5935 +       int irq = vector_to_irq(vector);
5936 +
5937 +       return startup_edge_ioapic_irq(irq);
5938 +}
5939 +
5940 +static void ack_edge_ioapic_vector(unsigned int vector)
5941 +{
5942 +       int irq = vector_to_irq(vector);
5943 +
5944 +       move_native_irq(vector);
5945 +       ack_edge_ioapic_irq(irq);
5946 +}
5947 +
5948 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
5949 +{
5950 +       int irq = vector_to_irq(vector);
5951 +
5952 +       return startup_level_ioapic_irq (irq);
5953 +}
5954 +
5955 +static void end_level_ioapic_vector (unsigned int vector)
5956 +{
5957 +       int irq = vector_to_irq(vector);
5958 +
5959 +       move_native_irq(vector);
5960 +       end_level_ioapic_irq(irq);
5961 +}
5962 +
5963 +static void mask_IO_APIC_vector (unsigned int vector)
5964 +{
5965 +       int irq = vector_to_irq(vector);
5966 +
5967 +       mask_IO_APIC_irq(irq);
5968 +}
5969 +
5970 +static void unmask_IO_APIC_vector (unsigned int vector)
5971 +{
5972 +       int irq = vector_to_irq(vector);
5973 +
5974 +       unmask_IO_APIC_irq(irq);
5975 +}
5976 +
5977 +#ifdef CONFIG_SMP
5978 +static void set_ioapic_affinity_vector (unsigned int vector,
5979 +                                       cpumask_t cpu_mask)
5980 +{
5981 +       int irq = vector_to_irq(vector);
5982 +
5983 +       set_native_irq_info(vector, cpu_mask);
5984 +       set_ioapic_affinity_irq(irq, cpu_mask);
5985 +}
5986 +#endif
5987 +#endif
5988 +
5989 +/*
5990 + * Level and edge triggered IO-APIC interrupts need different handling,
5991 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
5992 + * handled with the level-triggered descriptor, but that one has slightly
5993 + * more overhead. Level-triggered interrupts cannot be handled with the
5994 + * edge-triggered handler, without risking IRQ storms and other ugly
5995 + * races.
5996 + */
5997 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
5998 +       .typename       = "IO-APIC-edge",
5999 +       .startup        = startup_edge_ioapic,
6000 +       .shutdown       = shutdown_edge_ioapic,
6001 +       .enable         = enable_edge_ioapic,
6002 +       .disable        = disable_edge_ioapic,
6003 +       .ack            = ack_edge_ioapic,
6004 +       .end            = end_edge_ioapic,
6005 +#ifdef CONFIG_SMP
6006 +       .set_affinity   = set_ioapic_affinity,
6007 +#endif
6008 +};
6009 +
6010 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
6011 +       .typename       = "IO-APIC-level",
6012 +       .startup        = startup_level_ioapic,
6013 +       .shutdown       = shutdown_level_ioapic,
6014 +       .enable         = enable_level_ioapic,
6015 +       .disable        = disable_level_ioapic,
6016 +       .ack            = mask_and_ack_level_ioapic,
6017 +       .end            = end_level_ioapic,
6018 +#ifdef CONFIG_SMP
6019 +       .set_affinity   = set_ioapic_affinity,
6020 +#endif
6021 +};
6022 +#endif /* !CONFIG_XEN */
6023 +
6024 +static inline void init_IO_APIC_traps(void)
6025 +{
6026 +       int irq;
6027 +
6028 +       /*
6029 +        * NOTE! The local APIC isn't very good at handling
6030 +        * multiple interrupts at the same interrupt level.
6031 +        * As the interrupt level is determined by taking the
6032 +        * vector number and shifting that right by 4, we
6033 +        * want to spread these out a bit so that they don't
6034 +        * all fall in the same interrupt level.
6035 +        *
6036 +        * Also, we've got to be careful not to trash gate
6037 +        * 0x80, because int 0x80 is hm, kind of importantish. ;)
6038 +        */
6039 +       for (irq = 0; irq < NR_IRQS ; irq++) {
6040 +               int tmp = irq;
6041 +               if (use_pci_vector()) {
6042 +                       if (!platform_legacy_irq(tmp))
6043 +                               if ((tmp = vector_to_irq(tmp)) == -1)
6044 +                                       continue;
6045 +               }
6046 +               if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
6047 +                       /*
6048 +                        * Hmm.. We don't have an entry for this,
6049 +                        * so default to an old-fashioned 8259
6050 +                        * interrupt if we can..
6051 +                        */
6052 +                       if (irq < 16)
6053 +                               make_8259A_irq(irq);
6054 +#ifndef CONFIG_XEN
6055 +                       else
6056 +                               /* Strange. Oh, well.. */
6057 +                               irq_desc[irq].handler = &no_irq_type;
6058 +#endif
6059 +               }
6060 +       }
6061 +}
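
As the comment above says, the "interrupt level" is just the vector's upper nibble: vectors with equal vector >> 4 compete for the same local APIC priority class, and gate 0x80 must stay reserved for the int 0x80 system-call entry. A trivial illustration (user-space C):

    #include <stdio.h>

    int main(void)
    {
            unsigned int vectors[] = { 0x31, 0x39, 0x41, 0x80 };
            unsigned int i;

            for (i = 0; i < sizeof(vectors) / sizeof(vectors[0]); i++)
                    printf("vector 0x%02x -> level %u\n",
                           vectors[i], vectors[i] >> 4);
            return 0;
    }
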
6062 +
6063 +#ifndef CONFIG_XEN
6064 +static void enable_lapic_irq (unsigned int irq)
6065 +{
6066 +       unsigned long v;
6067 +
6068 +       v = apic_read(APIC_LVT0);
6069 +       apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
6070 +}
6071 +
6072 +static void disable_lapic_irq (unsigned int irq)
6073 +{
6074 +       unsigned long v;
6075 +
6076 +       v = apic_read(APIC_LVT0);
6077 +       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
6078 +}
6079 +
6080 +static void ack_lapic_irq (unsigned int irq)
6081 +{
6082 +       ack_APIC_irq();
6083 +}
6084 +
6085 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
6086 +
6087 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
6088 +       .typename       = "local-APIC-edge",
6089 +       .startup        = NULL, /* startup_irq() not used for IRQ0 */
6090 +       .shutdown       = NULL, /* shutdown_irq() not used for IRQ0 */
6091 +       .enable         = enable_lapic_irq,
6092 +       .disable        = disable_lapic_irq,
6093 +       .ack            = ack_lapic_irq,
6094 +       .end            = end_lapic_irq
6095 +};
6096 +
6097 +static void setup_nmi (void)
6098 +{
6099 +       /*
6100 +        * Dirty trick to enable the NMI watchdog ...
6101 +        * We put the 8259A master into AEOI mode and
6102 +        * unmask LVT0 on all local APICs as NMI.
6103 +        *
6104 +        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
6105 +        * is from Maciej W. Rozycki - so we do not have to EOI from
6106 +        * the NMI handler or the timer interrupt.
6107 +        */ 
6108 +       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
6109 +
6110 +       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
6111 +
6112 +       apic_printk(APIC_VERBOSE, " done.\n");
6113 +}
6114 +
6115 +/*
6116 + * This looks a bit hackish but it's about the only way of sending
6117 + * a few INTA cycles to 8259As and any associated glue logic.  ICR does
6118 + * not support the ExtINT mode, unfortunately.  We need to send these
6119 + * cycles as some i82489DX-based boards have glue logic that keeps the
6120 + * 8259A interrupt line asserted until INTA.  --macro
6121 + */
6122 +static inline void unlock_ExtINT_logic(void)
6123 +{
6124 +       int apic, pin, i;
6125 +       struct IO_APIC_route_entry entry0, entry1;
6126 +       unsigned char save_control, save_freq_select;
6127 +       unsigned long flags;
6128 +
6129 +       pin  = find_isa_irq_pin(8, mp_INT);
6130 +       apic = find_isa_irq_apic(8, mp_INT);
6131 +       if (pin == -1)
6132 +               return;
6133 +
6134 +       spin_lock_irqsave(&ioapic_lock, flags);
6135 +       *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
6136 +       *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
6137 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6138 +       clear_IO_APIC_pin(apic, pin);
6139 +
6140 +       memset(&entry1, 0, sizeof(entry1));
6141 +
6142 +       entry1.dest_mode = 0;                   /* physical delivery */
6143 +       entry1.mask = 0;                        /* unmask IRQ now */
6144 +       entry1.dest.physical.physical_dest = hard_smp_processor_id();
6145 +       entry1.delivery_mode = dest_ExtINT;
6146 +       entry1.polarity = entry0.polarity;
6147 +       entry1.trigger = 0;
6148 +       entry1.vector = 0;
6149 +
6150 +       spin_lock_irqsave(&ioapic_lock, flags);
6151 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
6152 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
6153 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6154 +
6155 +       save_control = CMOS_READ(RTC_CONTROL);
6156 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
6157 +       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
6158 +                  RTC_FREQ_SELECT);
6159 +       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
6160 +
6161 +       i = 100;
6162 +       while (i-- > 0) {
6163 +               mdelay(10);
6164 +               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
6165 +                       i -= 10;
6166 +       }
6167 +
6168 +       CMOS_WRITE(save_control, RTC_CONTROL);
6169 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
6170 +       clear_IO_APIC_pin(apic, pin);
6171 +
6172 +       spin_lock_irqsave(&ioapic_lock, flags);
6173 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
6174 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
6175 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6176 +}
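
The CMOS writes in unlock_ExtINT_logic() put the MC146818 RTC into its 1024 Hz periodic-interrupt mode: the low four bits of RTC_FREQ_SELECT choose a tap off the 32.768 kHz divider chain, giving a periodic frequency of 32768 >> (rate - 1), so the rate value 0x6 written above means 1024 Hz. A sketch of that standard MC146818 formula (user-space C):

    #include <stdio.h>

    int main(void)
    {
            unsigned int rate;

            /* valid periodic rate selects on the MC146818 run from 3 to 15 */
            for (rate = 3; rate <= 15; rate++)
                    printf("rate select 0x%x -> %u Hz\n",
                           rate, 32768u >> (rate - 1));
            return 0;
    }
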
6177 +
6178 +/*
6179 + * This code may look a bit paranoid, but it's supposed to cooperate with
6180 + * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
6181 + * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
6182 + * fanatically on his truly buggy board.
6183 + */
6184 +static inline void check_timer(void)
6185 +{
6186 +       int apic1, pin1, apic2, pin2;
6187 +       int vector;
6188 +
6189 +       /*
6190 +        * get/set the timer IRQ vector:
6191 +        */
6192 +       disable_8259A_irq(0);
6193 +       vector = assign_irq_vector(0);
6194 +       set_intr_gate(vector, interrupt[0]);
6195 +
6196 +       /*
6197 +        * Subtle, code in do_timer_interrupt() expects an AEOI
6198 +        * mode for the 8259A whenever interrupts are routed
6199 +        * through I/O APICs.  Also IRQ0 has to be enabled in
6200 +        * the 8259A which implies the virtual wire has to be
6201 +        * disabled in the local APIC.
6202 +        */
6203 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6204 +       init_8259A(1);
6205 +       timer_ack = 1;
6206 +       if (timer_over_8254 > 0)
6207 +               enable_8259A_irq(0);
6208 +
6209 +       pin1  = find_isa_irq_pin(0, mp_INT);
6210 +       apic1 = find_isa_irq_apic(0, mp_INT);
6211 +       pin2  = ioapic_i8259.pin;
6212 +       apic2 = ioapic_i8259.apic;
6213 +
6214 +       printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
6215 +               vector, apic1, pin1, apic2, pin2);
6216 +
6217 +       if (pin1 != -1) {
6218 +               /*
6219 +                * Ok, does IRQ0 through the IOAPIC work?
6220 +                */
6221 +               unmask_IO_APIC_irq(0);
6222 +               if (timer_irq_works()) {
6223 +                       if (nmi_watchdog == NMI_IO_APIC) {
6224 +                               disable_8259A_irq(0);
6225 +                               setup_nmi();
6226 +                               enable_8259A_irq(0);
6227 +                       }
6228 +                       if (disable_timer_pin_1 > 0)
6229 +                               clear_IO_APIC_pin(0, pin1);
6230 +                       return;
6231 +               }
6232 +               clear_IO_APIC_pin(apic1, pin1);
6233 +               printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
6234 +                               "IO-APIC\n");
6235 +       }
6236 +
6237 +       printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
6238 +       if (pin2 != -1) {
6239 +               printk("\n..... (found pin %d) ...", pin2);
6240 +               /*
6241 +                * legacy devices should be connected to IO APIC #0
6242 +                */
6243 +               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
6244 +               if (timer_irq_works()) {
6245 +                       printk("works.\n");
6246 +                       if (pin1 != -1)
6247 +                               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
6248 +                       else
6249 +                               add_pin_to_irq(0, apic2, pin2);
6250 +                       if (nmi_watchdog == NMI_IO_APIC) {
6251 +                               setup_nmi();
6252 +                       }
6253 +                       return;
6254 +               }
6255 +               /*
6256 +                * Cleanup, just in case ...
6257 +                */
6258 +               clear_IO_APIC_pin(apic2, pin2);
6259 +       }
6260 +       printk(" failed.\n");
6261 +
6262 +       if (nmi_watchdog == NMI_IO_APIC) {
6263 +               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
6264 +               nmi_watchdog = 0;
6265 +       }
6266 +
6267 +       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
6268 +
6269 +       disable_8259A_irq(0);
6270 +       irq_desc[0].handler = &lapic_irq_type;
6271 +       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
6272 +       enable_8259A_irq(0);
6273 +
6274 +       if (timer_irq_works()) {
6275 +               printk(" works.\n");
6276 +               return;
6277 +       }
6278 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
6279 +       printk(" failed.\n");
6280 +
6281 +       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
6282 +
6283 +       timer_ack = 0;
6284 +       init_8259A(0);
6285 +       make_8259A_irq(0);
6286 +       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
6287 +
6288 +       unlock_ExtINT_logic();
6289 +
6290 +       if (timer_irq_works()) {
6291 +               printk(" works.\n");
6292 +               return;
6293 +       }
6294 +       printk(" failed :(.\n");
6295 +       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
6296 +               "report.  Then try booting with the 'noapic' option");
6297 +}
6298 +#else
6299 +#define check_timer() ((void)0)
6300 +#endif
6301 +
6302 +/*
6303 + *
6304 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
6305 + * - IRQ2 is the cascade IRQ, and cannot be an IO-APIC IRQ.
6306 + *   Linux doesn't really care, as it's not actually used
6307 + *   for any interrupt handling anyway.
6308 + */
6309 +#define PIC_IRQS       (1 << PIC_CASCADE_IR)
6310 +
6311 +void __init setup_IO_APIC(void)
6312 +{
6313 +       enable_IO_APIC();
6314 +
6315 +       if (acpi_ioapic)
6316 +               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
6317 +       else
6318 +               io_apic_irqs = ~PIC_IRQS;
6319 +
6320 +       printk("ENABLING IO-APIC IRQs\n");
6321 +
6322 +       /*
6323 +        * Set up IO-APIC IRQ routing.
6324 +        */
6325 +       if (!acpi_ioapic)
6326 +               setup_ioapic_ids_from_mpc();
6327 +#ifndef CONFIG_XEN
6328 +       sync_Arb_IDs();
6329 +#endif
6330 +       setup_IO_APIC_irqs();
6331 +       init_IO_APIC_traps();
6332 +       check_timer();
6333 +       if (!acpi_ioapic)
6334 +               print_IO_APIC();
6335 +}
6336 +
6337 +static int __init setup_disable_8254_timer(char *s)
6338 +{
6339 +       timer_over_8254 = -1;
6340 +       return 1;
6341 +}
6342 +static int __init setup_enable_8254_timer(char *s)
6343 +{
6344 +       timer_over_8254 = 2;
6345 +       return 1;
6346 +}
6347 +
6348 +__setup("disable_8254_timer", setup_disable_8254_timer);
6349 +__setup("enable_8254_timer", setup_enable_8254_timer);
6350 +
6351 +/*
6352 + *     Called after all the initialization is done. If we didn't find any
6353 + *     APIC bugs then we can allow the modify fast path
6354 + */
6355 +
6356 +static int __init io_apic_bug_finalize(void)
6357 +{
6358 +       if(sis_apic_bug == -1)
6359 +               sis_apic_bug = 0;
6360 +       return 0;
6361 +}
6362 +
6363 +late_initcall(io_apic_bug_finalize);
6364 +
6365 +struct sysfs_ioapic_data {
6366 +       struct sys_device dev;
6367 +       struct IO_APIC_route_entry entry[0];
6368 +};
6369 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
6370 +
6371 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
6372 +{
6373 +       struct IO_APIC_route_entry *entry;
6374 +       struct sysfs_ioapic_data *data;
6375 +       unsigned long flags;
6376 +       int i;
6377 +       
6378 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
6379 +       entry = data->entry;
6380 +       spin_lock_irqsave(&ioapic_lock, flags);
6381 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
6382 +               *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
6383 +               *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
6384 +       }
6385 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6386 +
6387 +       return 0;
6388 +}
6389 +
6390 +static int ioapic_resume(struct sys_device *dev)
6391 +{
6392 +       struct IO_APIC_route_entry *entry;
6393 +       struct sysfs_ioapic_data *data;
6394 +       unsigned long flags;
6395 +       union IO_APIC_reg_00 reg_00;
6396 +       int i;
6397 +       
6398 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
6399 +       entry = data->entry;
6400 +
6401 +       spin_lock_irqsave(&ioapic_lock, flags);
6402 +       reg_00.raw = io_apic_read(dev->id, 0);
6403 +       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
6404 +               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
6405 +               io_apic_write(dev->id, 0, reg_00.raw);
6406 +       }
6407 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
6408 +               io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
6409 +               io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
6410 +       }
6411 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6412 +
6413 +       return 0;
6414 +}
6415 +
6416 +static struct sysdev_class ioapic_sysdev_class = {
6417 +       set_kset_name("ioapic"),
6418 +       .suspend = ioapic_suspend,
6419 +       .resume = ioapic_resume,
6420 +};
6421 +
6422 +static int __init ioapic_init_sysfs(void)
6423 +{
6424 +       struct sys_device * dev;
6425 +       int i, size, error = 0;
6426 +
6427 +       error = sysdev_class_register(&ioapic_sysdev_class);
6428 +       if (error)
6429 +               return error;
6430 +
6431 +       for (i = 0; i < nr_ioapics; i++ ) {
6432 +               size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
6433 +                       * sizeof(struct IO_APIC_route_entry);
6434 +               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
6435 +               if (!mp_ioapic_data[i]) {
6436 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6437 +                       continue;
6438 +               }
6439 +               memset(mp_ioapic_data[i], 0, size);
6440 +               dev = &mp_ioapic_data[i]->dev;
6441 +               dev->id = i; 
6442 +               dev->cls = &ioapic_sysdev_class;
6443 +               error = sysdev_register(dev);
6444 +               if (error) {
6445 +                       kfree(mp_ioapic_data[i]);
6446 +                       mp_ioapic_data[i] = NULL;
6447 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6448 +                       continue;
6449 +               }
6450 +       }
6451 +
6452 +       return 0;
6453 +}
6454 +
6455 +device_initcall(ioapic_init_sysfs);
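
struct sysfs_ioapic_data above relies on the zero-length trailing array idiom: a single kmalloc() holds the header plus one route entry per redirection register. A user-space sketch of the same allocation pattern, written with the C99 flexible array member that replaces the [0] spelling:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct route_entry { unsigned int lo, hi; };

    struct ioapic_data {
            int id;
            struct route_entry entry[];     /* flexible array member */
    };

    int main(void)
    {
            unsigned int nr = 24;           /* e.g. nr_ioapic_registers[i] */
            size_t size = sizeof(struct ioapic_data)
                          + nr * sizeof(struct route_entry);
            struct ioapic_data *d = malloc(size);

            if (!d)
                    return 1;
            memset(d, 0, size);
            d->id = 0;
            d->entry[nr - 1].lo = 0x10000;  /* mask bit, like a reset entry */
            printf("one allocation of %zu bytes covers %u entries\n",
                   size, nr);
            free(d);
            return 0;
    }
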
6456 +
6457 +/* --------------------------------------------------------------------------
6458 +                          ACPI-based IOAPIC Configuration
6459 +   -------------------------------------------------------------------------- */
6460 +
6461 +#ifdef CONFIG_ACPI
6462 +
6463 +int __init io_apic_get_unique_id (int ioapic, int apic_id)
6464 +{
6465 +#ifndef CONFIG_XEN
6466 +       union IO_APIC_reg_00 reg_00;
6467 +       static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
6468 +       physid_mask_t tmp;
6469 +       unsigned long flags;
6470 +       int i = 0;
6471 +
6472 +       /*
6473 +        * The P4 platform supports up to 256 APIC IDs on two separate APIC 
6474 +        * buses (one for LAPICs, one for IOAPICs), where predecessors only 
6475 +        * supports up to 16 on one shared APIC bus.
6476 +        * support up to 16 on one shared APIC bus.
6477 +        * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
6478 +        *      advantage of new APIC bus architecture.
6479 +        */
6480 +
6481 +       if (physids_empty(apic_id_map))
6482 +               apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
6483 +
6484 +       spin_lock_irqsave(&ioapic_lock, flags);
6485 +       reg_00.raw = io_apic_read(ioapic, 0);
6486 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6487 +
6488 +       if (apic_id >= get_physical_broadcast()) {
6489 +               printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
6490 +                       "%d\n", ioapic, apic_id, reg_00.bits.ID);
6491 +               apic_id = reg_00.bits.ID;
6492 +       }
6493 +
6494 +       /*
6495 +        * Every APIC in a system must have a unique ID or we get lots of nice 
6496 +        * 'stuck on smp_invalidate_needed IPI wait' messages.
6497 +        */
6498 +       if (check_apicid_used(apic_id_map, apic_id)) {
6499 +
6500 +               for (i = 0; i < get_physical_broadcast(); i++) {
6501 +                       if (!check_apicid_used(apic_id_map, i))
6502 +                               break;
6503 +               }
6504 +
6505 +               if (i == get_physical_broadcast())
6506 +                       panic("Max apic_id exceeded!\n");
6507 +
6508 +               printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
6509 +                       "trying %d\n", ioapic, apic_id, i);
6510 +
6511 +               apic_id = i;
6512 +       } 
6513 +
6514 +       tmp = apicid_to_cpu_present(apic_id);
6515 +       physids_or(apic_id_map, apic_id_map, tmp);
6516 +
6517 +       if (reg_00.bits.ID != apic_id) {
6518 +               reg_00.bits.ID = apic_id;
6519 +
6520 +               spin_lock_irqsave(&ioapic_lock, flags);
6521 +               io_apic_write(ioapic, 0, reg_00.raw);
6522 +               reg_00.raw = io_apic_read(ioapic, 0);
6523 +               spin_unlock_irqrestore(&ioapic_lock, flags);
6524 +
6525 +               /* Sanity check */
6526 +               if (reg_00.bits.ID != apic_id) {
6527 +                       printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
6528 +                       return -1;
6529 +               }
6530 +       }
6531 +
6532 +       apic_printk(APIC_VERBOSE, KERN_INFO
6533 +                       "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
6534 +#endif /* !CONFIG_XEN */
6535 +
6536 +       return apic_id;
6537 +}
6538 +
6539 +
6540 +int __init io_apic_get_version (int ioapic)
6541 +{
6542 +       union IO_APIC_reg_01    reg_01;
6543 +       unsigned long flags;
6544 +
6545 +       spin_lock_irqsave(&ioapic_lock, flags);
6546 +       reg_01.raw = io_apic_read(ioapic, 1);
6547 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6548 +
6549 +       return reg_01.bits.version;
6550 +}
6551 +
6552 +
6553 +int __init io_apic_get_redir_entries (int ioapic)
6554 +{
6555 +       union IO_APIC_reg_01    reg_01;
6556 +       unsigned long flags;
6557 +
6558 +       spin_lock_irqsave(&ioapic_lock, flags);
6559 +       reg_01.raw = io_apic_read(ioapic, 1);
6560 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6561 +
6562 +       return reg_01.bits.entries;
6563 +}
6564 +
6565 +
6566 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
6567 +{
6568 +       struct IO_APIC_route_entry entry;
6569 +       unsigned long flags;
6570 +
6571 +       if (!IO_APIC_IRQ(irq)) {
6572 +               printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
6573 +                       ioapic);
6574 +               return -EINVAL;
6575 +       }
6576 +
6577 +       /*
6578 +        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
6579 +        * Note that we mask (disable) IRQs now -- these get enabled when the
6580 +        * corresponding device driver registers for this IRQ.
6581 +        */
6582 +
6583 +       memset(&entry,0,sizeof(entry));
6584 +
6585 +       entry.delivery_mode = INT_DELIVERY_MODE;
6586 +       entry.dest_mode = INT_DEST_MODE;
6587 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6588 +       entry.trigger = edge_level;
6589 +       entry.polarity = active_high_low;
6590 +       entry.mask  = 1;
6591 +
6592 +       /*
6593 +        * IRQs < 16 are already in the irq_2_pin[] map
6594 +        */
6595 +       if (irq >= 16)
6596 +               add_pin_to_irq(irq, ioapic, pin);
6597 +
6598 +       entry.vector = assign_irq_vector(irq);
6599 +
6600 +       apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
6601 +               "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
6602 +               mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
6603 +               edge_level, active_high_low);
6604 +
6605 +       ioapic_register_intr(irq, entry.vector, edge_level);
6606 +
6607 +       if (!ioapic && (irq < 16))
6608 +               disable_8259A_irq(irq);
6609 +
6610 +       spin_lock_irqsave(&ioapic_lock, flags);
6611 +       io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
6612 +       io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
6613 +       set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
6614 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6615 +
6616 +       return 0;
6617 +}
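
The two io_apic_write() calls above mirror the hardware interface: each 64-bit redirection entry is accessed through a 32-bit indirect register window, as the low word at index 0x10 + 2 * pin and the high word at 0x11 + 2 * pin. A small sketch of the split using a union rather than the kernel's pointer casts (assumes the little-endian layout of x86):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    union redir_entry {
            uint64_t raw;
            struct { uint32_t lo, hi; } w;  /* x86 is little-endian */
    };

    int main(void)
    {
            union redir_entry e = { .raw = 0x00000000000100ffULL };
            unsigned int pin = 3;

            /* mask bit 16 set, vector 0xff, everything else zero */
            printf("reg 0x%02x <- 0x%08" PRIx32 " (low word)\n",
                   0x10 + 2 * pin, e.w.lo);
            printf("reg 0x%02x <- 0x%08" PRIx32 " (high word)\n",
                   0x11 + 2 * pin, e.w.hi);
            return 0;
    }
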
6618 +
6619 +#endif /* CONFIG_ACPI */
6620 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/ioport-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/ioport-xen.c
6621 --- ref-linux-2.6.16.9/arch/i386/kernel/ioport-xen.c    1970-01-01 01:00:00.000000000 +0100
6622 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/ioport-xen.c       2006-04-10 00:05:52.000000000 +0200
6623 @@ -0,0 +1,122 @@
6624 +/*
6625 + *     linux/arch/i386/kernel/ioport.c
6626 + *
6627 + * This contains the io-permission bitmap code - written by obz, with changes
6628 + * by Linus.
6629 + */
6630 +
6631 +#include <linux/sched.h>
6632 +#include <linux/kernel.h>
6633 +#include <linux/capability.h>
6634 +#include <linux/errno.h>
6635 +#include <linux/types.h>
6636 +#include <linux/ioport.h>
6637 +#include <linux/smp.h>
6638 +#include <linux/smp_lock.h>
6639 +#include <linux/stddef.h>
6640 +#include <linux/slab.h>
6641 +#include <linux/thread_info.h>
6642 +#include <xen/interface/physdev.h>
6643 +
6644 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
6645 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
6646 +{
6647 +       unsigned long mask;
6648 +       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
6649 +       unsigned int low_index = base & (BITS_PER_LONG-1);
6650 +       int length = low_index + extent;
6651 +
6652 +       if (low_index != 0) {
6653 +               mask = (~0UL << low_index);
6654 +               if (length < BITS_PER_LONG)
6655 +                       mask &= ~(~0UL << length);
6656 +               if (new_value)
6657 +                       *bitmap_base++ |= mask;
6658 +               else
6659 +                       *bitmap_base++ &= ~mask;
6660 +               length -= BITS_PER_LONG;
6661 +       }
6662 +
6663 +       mask = (new_value ? ~0UL : 0UL);
6664 +       while (length >= BITS_PER_LONG) {
6665 +               *bitmap_base++ = mask;
6666 +               length -= BITS_PER_LONG;
6667 +       }
6668 +
6669 +       if (length > 0) {
6670 +               mask = ~(~0UL << length);
6671 +               if (new_value)
6672 +                       *bitmap_base++ |= mask;
6673 +               else
6674 +                       *bitmap_base++ &= ~mask;
6675 +       }
6676 +}
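
set_bitmap() updates the bit range [base, base + extent) in three phases: a partial leading word, a run of whole words, and a partial trailing word. The routine compiles essentially unchanged in user space, which makes its edge cases easy to test; the only adjustments below are (int) casts, needed because this standalone version derives BITS_PER_LONG from sizeof (an unsigned value) while length can legitimately go negative:

    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    #define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

    static void set_bitmap(unsigned long *bitmap, unsigned int base,
                           unsigned int extent, int new_value)
    {
            unsigned long mask;
            unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
            unsigned int low_index = base & (BITS_PER_LONG - 1);
            int length = low_index + extent;

            if (low_index != 0) {                   /* partial leading word */
                    mask = (~0UL << low_index);
                    if (length < (int)BITS_PER_LONG)
                            mask &= ~(~0UL << length);
                    if (new_value)
                            *bitmap_base++ |= mask;
                    else
                            *bitmap_base++ &= ~mask;
                    length -= (int)BITS_PER_LONG;
            }

            mask = (new_value ? ~0UL : 0UL);
            while (length >= (int)BITS_PER_LONG) {  /* whole middle words */
                    *bitmap_base++ = mask;
                    length -= (int)BITS_PER_LONG;
            }

            if (length > 0) {                       /* partial trailing word */
                    mask = ~(~0UL << length);
                    if (new_value)
                            *bitmap_base++ |= mask;
                    else
                            *bitmap_base++ &= ~mask;
            }
    }

    int main(void)
    {
            unsigned long bm[4] = { 0 };
            unsigned int i;

            set_bitmap(bm, 7, 70, 1);       /* crosses word boundaries */
            for (i = 0; i < 4 * BITS_PER_LONG; i++) {
                    int bit = (bm[i / BITS_PER_LONG] >> (i % BITS_PER_LONG)) & 1;
                    assert(bit == (i >= 7 && i < 77));
            }
            puts("set_bitmap range test passed");
            return 0;
    }
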
6677 +
6678 +
6679 +/*
6680 + * this changes the io permissions bitmap in the current task.
6681 + */
6682 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
6683 +{
6684 +       struct thread_struct * t = &current->thread;
6685 +       unsigned long *bitmap;
6686 +       physdev_op_t op;
6687 +
6688 +       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
6689 +               return -EINVAL;
6690 +       if (turn_on && !capable(CAP_SYS_RAWIO))
6691 +               return -EPERM;
6692 +
6693 +       /*
6694 +        * If it's the first ioperm() call in this thread's lifetime, set the
6695 +        * IO bitmap up. ioperm() is much less timing critical than clone(),
6696 +        * which is why we delay this operation until now:
6697 +        */
6698 +       if (!t->io_bitmap_ptr) {
6699 +               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6700 +               if (!bitmap)
6701 +                       return -ENOMEM;
6702 +
6703 +               memset(bitmap, 0xff, IO_BITMAP_BYTES);
6704 +               t->io_bitmap_ptr = bitmap;
6705 +
6706 +               op.cmd = PHYSDEVOP_SET_IOBITMAP;
6707 +               op.u.set_iobitmap.bitmap   = (char *)bitmap;
6708 +               op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS;
6709 +               HYPERVISOR_physdev_op(&op);
6710 +       }
6711 +
6712 +       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6713 +
6714 +       return 0;
6715 +}
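
User space normally reaches this entry point through glibc's ioperm(3) wrapper. A hypothetical example (the parallel-port base 0x378 is only an illustration; this needs root or CAP_SYS_RAWIO, an x86 host, and on older toolchains compiling with -O so the sys/io.h port intrinsics inline):

    #include <stdio.h>
    #include <sys/io.h>

    int main(void)
    {
            /* ask for access to ports 0x378..0x37a */
            if (ioperm(0x378, 3, 1) < 0) {
                    perror("ioperm");
                    return 1;
            }
            outb(0x00, 0x378);      /* access is now permitted */
            ioperm(0x378, 3, 0);    /* and revoked again */
            return 0;
    }
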
6716 +
6717 +/*
6718 + * sys_iopl has to be used when you want to access the IO ports
6719 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6720 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
6721 + *
6722 + * Here we just change the eflags value on the stack: we allow
6723 + * only the super-user to do it. This depends on the stack-layout
6724 + * on system-call entry - see also fork() and the signal handling
6725 + * code.
6726 + */
6727 +
6728 +asmlinkage long sys_iopl(unsigned long unused)
6729 +{
6730 +       volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6731 +       unsigned int level = regs->ebx;
6732 +       struct thread_struct *t = &current->thread;
6733 +       unsigned int old = (t->iopl >> 12) & 3;
6734 +
6735 +       if (level > 3)
6736 +               return -EINVAL;
6737 +       /* Trying to gain more privileges? */
6738 +       if (level > old) {
6739 +               if (!capable(CAP_SYS_RAWIO))
6740 +                       return -EPERM;
6741 +       }
6742 +       t->iopl = level << 12;
6743 +       set_iopl_mask(t->iopl);
6744 +       return 0;
6745 +}
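
The level << 12 stored in t->iopl mirrors the EFLAGS layout, where IOPL occupies bits 12 and 13, so packing and unpacking the field is just a shift and a two-bit mask. A trivial check:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int level;

            for (level = 0; level <= 3; level++) {
                    unsigned int stored = level << 12;      /* as in t->iopl */

                    assert(((stored >> 12) & 3) == level);
                    printf("iopl %u -> eflags field 0x%04x\n", level, stored);
            }
            return 0;
    }
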
6746 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/irq-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/irq-xen.c
6747 --- ref-linux-2.6.16.9/arch/i386/kernel/irq-xen.c       1970-01-01 01:00:00.000000000 +0100
6748 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/irq-xen.c  2006-04-10 00:05:52.000000000 +0200
6749 @@ -0,0 +1,306 @@
6750 +/*
6751 + *     linux/arch/i386/kernel/irq.c
6752 + *
6753 + *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
6754 + *
6755 + * This file contains the lowest level x86-specific interrupt
6756 + * entry, irq-stacks and irq statistics code. All the remaining
6757 + * irq logic is done by the generic kernel/irq/ code and
6758 + * by the x86-specific irq controller code. (e.g. i8259.c and
6759 + * io_apic.c.)
6760 + */
6761 +
6762 +#include <asm/uaccess.h>
6763 +#include <linux/module.h>
6764 +#include <linux/seq_file.h>
6765 +#include <linux/interrupt.h>
6766 +#include <linux/kernel_stat.h>
6767 +#include <linux/notifier.h>
6768 +#include <linux/cpu.h>
6769 +#include <linux/delay.h>
6770 +
6771 +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
6772 +EXPORT_PER_CPU_SYMBOL(irq_stat);
6773 +
6774 +#ifndef CONFIG_X86_LOCAL_APIC
6775 +/*
6776 + * 'what should we do if we get a hw irq event on an illegal vector'.
6777 + * Each architecture has to answer this itself.
6778 + */
6779 +void ack_bad_irq(unsigned int irq)
6780 +{
6781 +       printk("unexpected IRQ trap at vector %02x\n", irq);
6782 +}
6783 +#endif
6784 +
6785 +#ifdef CONFIG_4KSTACKS
6786 +/*
6787 + * per-CPU IRQ handling contexts (thread information and stack)
6788 + */
6789 +union irq_ctx {
6790 +       struct thread_info      tinfo;
6791 +       u32                     stack[THREAD_SIZE/sizeof(u32)];
6792 +};
6793 +
6794 +static union irq_ctx *hardirq_ctx[NR_CPUS];
6795 +static union irq_ctx *softirq_ctx[NR_CPUS];
6796 +#endif
6797 +
6798 +/*
6799 + * do_IRQ handles all normal device IRQ's (the special
6800 + * SMP cross-CPU interrupts have their own specific
6801 + * handlers).
6802 + */
6803 +fastcall unsigned int do_IRQ(struct pt_regs *regs)
6804 +{      
6805 +       /* high bit used in ret_from_ code */
6806 +       int irq = ~regs->orig_eax;
6807 +#ifdef CONFIG_4KSTACKS
6808 +       union irq_ctx *curctx, *irqctx;
6809 +       u32 *isp;
6810 +#endif
6811 +
6812 +       irq_enter();
6813 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
6814 +       /* Debugging check for stack overflow: is there less than 1KB free? */
6815 +       {
6816 +               long esp;
6817 +
6818 +               __asm__ __volatile__("andl %%esp,%0" :
6819 +                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
6820 +               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
6821 +                       printk("do_IRQ: stack overflow: %ld\n",
6822 +                               esp - sizeof(struct thread_info));
6823 +                       dump_stack();
6824 +               }
6825 +       }
6826 +#endif
6827 +
6828 +#ifdef CONFIG_4KSTACKS
6829 +
6830 +       curctx = (union irq_ctx *) current_thread_info();
6831 +       irqctx = hardirq_ctx[smp_processor_id()];
6832 +
6833 +       /*
6834 +        * this is where we switch to the IRQ stack. However, if we are
6835 +        * already using the IRQ stack (because we interrupted a hardirq
6836 +        * handler) we can't do that and just have to keep using the
6837 +        * current stack (which is the irq stack already after all)
6838 +        */
6839 +       if (curctx != irqctx) {
6840 +               int arg1, arg2, ebx;
6841 +
6842 +               /* build the stack frame on the IRQ stack */
6843 +               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6844 +               irqctx->tinfo.task = curctx->tinfo.task;
6845 +               irqctx->tinfo.previous_esp = current_stack_pointer;
6846 +
6847 +               asm volatile(
6848 +                       "       xchgl   %%ebx,%%esp      \n"
6849 +                       "       call    __do_IRQ         \n"
6850 +                       "       movl   %%ebx,%%esp      \n"
6851 +                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
6852 +                       :  "0" (irq),   "1" (regs),  "2" (isp)
6853 +                       : "memory", "cc", "ecx"
6854 +               );
6855 +       } else
6856 +#endif
6857 +               __do_IRQ(irq, regs);
6858 +
6859 +       irq_exit();
6860 +
6861 +       return 1;
6862 +}
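
The overflow check above works because kernel stacks are THREAD_SIZE aligned with struct thread_info at the bottom: masking %esp with THREAD_SIZE - 1 yields the stack pointer's offset above the base, i.e. the bytes still free before a growing stack tramples thread_info. A user-space illustration (THREAD_SIZE is 4096 with CONFIG_4KSTACKS and 8192 otherwise; 8192 below is just an example value):

    #include <stdio.h>

    #define THREAD_SIZE 8192ul

    int main(void)
    {
            unsigned long sp = 0xc1234230ul;        /* a made-up %esp */
            unsigned long base = sp & ~(THREAD_SIZE - 1);

            printf("stack base       = %#lx\n", base);
            printf("bytes above base = %lu\n", sp & (THREAD_SIZE - 1));
            /* the kernel warns when this falls below
               sizeof(struct thread_info) + STACK_WARN */
            return 0;
    }
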
6863 +
6864 +#ifdef CONFIG_4KSTACKS
6865 +
6866 +/*
6867 + * These should really be __section__(".bss.page_aligned") as well, but
6868 + * gcc's 3.0 and earlier don't handle that correctly.
6869 + */
6870 +static char softirq_stack[NR_CPUS * THREAD_SIZE]
6871 +               __attribute__((__aligned__(THREAD_SIZE)));
6872 +
6873 +static char hardirq_stack[NR_CPUS * THREAD_SIZE]
6874 +               __attribute__((__aligned__(THREAD_SIZE)));
6875 +
6876 +/*
6877 + * allocate per-cpu stacks for hardirq and for softirq processing
6878 + */
6879 +void irq_ctx_init(int cpu)
6880 +{
6881 +       union irq_ctx *irqctx;
6882 +
6883 +       if (hardirq_ctx[cpu])
6884 +               return;
6885 +
6886 +       irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
6887 +       irqctx->tinfo.task              = NULL;
6888 +       irqctx->tinfo.exec_domain       = NULL;
6889 +       irqctx->tinfo.cpu               = cpu;
6890 +       irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
6891 +       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
6892 +
6893 +       hardirq_ctx[cpu] = irqctx;
6894 +
6895 +       irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
6896 +       irqctx->tinfo.task              = NULL;
6897 +       irqctx->tinfo.exec_domain       = NULL;
6898 +       irqctx->tinfo.cpu               = cpu;
6899 +       irqctx->tinfo.preempt_count     = SOFTIRQ_OFFSET;
6900 +       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
6901 +
6902 +       softirq_ctx[cpu] = irqctx;
6903 +
6904 +       printk("CPU %u irqstacks, hard=%p soft=%p\n",
6905 +               cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
6906 +}
6907 +
6908 +void irq_ctx_exit(int cpu)
6909 +{
6910 +       hardirq_ctx[cpu] = NULL;
6911 +}
6912 +
6913 +extern asmlinkage void __do_softirq(void);
6914 +
6915 +asmlinkage void do_softirq(void)
6916 +{
6917 +       unsigned long flags;
6918 +       struct thread_info *curctx;
6919 +       union irq_ctx *irqctx;
6920 +       u32 *isp;
6921 +
6922 +       if (in_interrupt())
6923 +               return;
6924 +
6925 +       local_irq_save(flags);
6926 +
6927 +       if (local_softirq_pending()) {
6928 +               curctx = current_thread_info();
6929 +               irqctx = softirq_ctx[smp_processor_id()];
6930 +               irqctx->tinfo.task = curctx->task;
6931 +               irqctx->tinfo.previous_esp = current_stack_pointer;
6932 +
6933 +               /* build the stack frame on the softirq stack */
6934 +               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6935 +
6936 +               asm volatile(
6937 +                       "       xchgl   %%ebx,%%esp     \n"
6938 +                       "       call    __do_softirq    \n"
6939 +                       "       movl    %%ebx,%%esp     \n"
6940 +                       : "=b"(isp)
6941 +                       : "0"(isp)
6942 +                       : "memory", "cc", "edx", "ecx", "eax"
6943 +               );
6944 +       }
6945 +
6946 +       local_irq_restore(flags);
6947 +}
6948 +
6949 +EXPORT_SYMBOL(do_softirq);
6950 +#endif
6951 +
6952 +/*
6953 + * Interrupt statistics:
6954 + */
6955 +
6956 +atomic_t irq_err_count;
6957 +
6958 +/*
6959 + * /proc/interrupts printing:
6960 + */
6961 +
6962 +int show_interrupts(struct seq_file *p, void *v)
6963 +{
6964 +       int i = *(loff_t *) v, j;
6965 +       struct irqaction * action;
6966 +       unsigned long flags;
6967 +
6968 +       if (i == 0) {
6969 +               seq_printf(p, "           ");
6970 +               for_each_online_cpu(j)
6971 +                       seq_printf(p, "CPU%d       ",j);
6972 +               seq_putc(p, '\n');
6973 +       }
6974 +
6975 +       if (i < NR_IRQS) {
6976 +               spin_lock_irqsave(&irq_desc[i].lock, flags);
6977 +               action = irq_desc[i].action;
6978 +               if (!action)
6979 +                       goto skip;
6980 +               seq_printf(p, "%3d: ",i);
6981 +#ifndef CONFIG_SMP
6982 +               seq_printf(p, "%10u ", kstat_irqs(i));
6983 +#else
6984 +               for_each_online_cpu(j)
6985 +                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
6986 +#endif
6987 +               seq_printf(p, " %14s", irq_desc[i].handler->typename);
6988 +               seq_printf(p, "  %s", action->name);
6989 +
6990 +               for (action=action->next; action; action = action->next)
6991 +                       seq_printf(p, ", %s", action->name);
6992 +
6993 +               seq_putc(p, '\n');
6994 +skip:
6995 +               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
6996 +       } else if (i == NR_IRQS) {
6997 +               seq_printf(p, "NMI: ");
6998 +               for_each_online_cpu(j)
6999 +                       seq_printf(p, "%10u ", nmi_count(j));
7000 +               seq_putc(p, '\n');
7001 +#ifdef CONFIG_X86_LOCAL_APIC
7002 +               seq_printf(p, "LOC: ");
7003 +               for_each_online_cpu(j)
7004 +                       seq_printf(p, "%10u ",
7005 +                               per_cpu(irq_stat,j).apic_timer_irqs);
7006 +               seq_putc(p, '\n');
7007 +#endif
7008 +               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
7009 +#if defined(CONFIG_X86_IO_APIC)
7010 +               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
7011 +#endif
7012 +       }
7013 +       return 0;
7014 +}
7015 +
7016 +#ifdef CONFIG_HOTPLUG_CPU
7017 +
7018 +void fixup_irqs(cpumask_t map)
7019 +{
7020 +       unsigned int irq;
7021 +       static int warned;
7022 +
7023 +       for (irq = 0; irq < NR_IRQS; irq++) {
7024 +               cpumask_t mask;
7025 +               if (irq == 2)
7026 +                       continue;
7027 +
7028 +               cpus_and(mask, irq_affinity[irq], map);
7029 +               if (any_online_cpu(mask) == NR_CPUS) {
7030 +                       /*printk("Breaking affinity for irq %i\n", irq);*/
7031 +                       mask = map;
7032 +               }
7033 +               if (irq_desc[irq].handler->set_affinity)
7034 +                       irq_desc[irq].handler->set_affinity(irq, mask);
7035 +               else if (irq_desc[irq].action && !(warned++))
7036 +                       printk("Cannot set affinity for irq %i\n", irq);
7037 +       }
7038 +
7039 +#if 0
7040 +       barrier();
7041 +       /* Ingo Molnar says: "after the IO-APIC masks have been redirected
7042 +          [note the nop - the interrupt-enable boundary on x86 is two
7043 +          instructions from sti] - to flush out pending hardirqs and
7044 +          IPIs. After this point nothing is supposed to reach this CPU." */
7045 +       __asm__ __volatile__("sti; nop; cli");
7046 +       barrier();
7047 +#else
7048 +       /* That doesn't seem sufficient.  Give it 1ms. */
7049 +       local_irq_enable();
7050 +       mdelay(1);
7051 +       local_irq_disable();
7052 +#endif
7053 +}
7054 +#endif
7055 +
7056 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/ldt-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/ldt-xen.c
7057 --- ref-linux-2.6.16.9/arch/i386/kernel/ldt-xen.c       1970-01-01 01:00:00.000000000 +0100
7058 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/ldt-xen.c  2006-04-10 00:05:52.000000000 +0200
7059 @@ -0,0 +1,269 @@
7060 +/*
7061 + * linux/kernel/ldt.c
7062 + *
7063 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
7064 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
7065 + */
7066 +
7067 +#include <linux/errno.h>
7068 +#include <linux/sched.h>
7069 +#include <linux/string.h>
7070 +#include <linux/mm.h>
7071 +#include <linux/smp.h>
7072 +#include <linux/smp_lock.h>
7073 +#include <linux/vmalloc.h>
7074 +#include <linux/slab.h>
7075 +
7076 +#include <asm/uaccess.h>
7077 +#include <asm/system.h>
7078 +#include <asm/ldt.h>
7079 +#include <asm/desc.h>
7080 +#include <asm/mmu_context.h>
7081 +
7082 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
7083 +static void flush_ldt(void *null)
7084 +{
7085 +       if (current->active_mm)
7086 +               load_LDT(&current->active_mm->context);
7087 +}
7088 +#endif
7089 +
7090 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
7091 +{
7092 +       void *oldldt;
7093 +       void *newldt;
7094 +       int oldsize;
7095 +
7096 +       if (mincount <= pc->size)
7097 +               return 0;
7098 +       oldsize = pc->size;
7099 +       mincount = (mincount+511)&(~511);
7100 +       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
7101 +               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
7102 +       else
7103 +               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
7104 +
7105 +       if (!newldt)
7106 +               return -ENOMEM;
7107 +
7108 +       if (oldsize)
7109 +               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
7110 +       oldldt = pc->ldt;
7111 +       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
7112 +       pc->ldt = newldt;
7113 +       wmb();
7114 +       pc->size = mincount;
7115 +       wmb();
7116 +
7117 +       if (reload) {
7118 +#ifdef CONFIG_SMP
7119 +               cpumask_t mask;
7120 +               preempt_disable();
7121 +#endif
7122 +               make_pages_readonly(
7123 +                       pc->ldt,
7124 +                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7125 +                       XENFEAT_writable_descriptor_tables);
7126 +               load_LDT(pc);
7127 +#ifdef CONFIG_SMP
7128 +               mask = cpumask_of_cpu(smp_processor_id());
7129 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
7130 +                       smp_call_function(flush_ldt, NULL, 1, 1);
7131 +               preempt_enable();
7132 +#endif
7133 +       }
7134 +       if (oldsize) {
7135 +               make_pages_writable(
7136 +                       oldldt,
7137 +                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
7138 +                       XENFEAT_writable_descriptor_tables);
7139 +               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
7140 +                       vfree(oldldt);
7141 +               else
7142 +                       kfree(oldldt);
7143 +       }
7144 +       return 0;
7145 +}
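
The (mincount + 511) & ~511 rounding grows the LDT in 512-entry steps; at 8 bytes per i386 descriptor that is exactly one 4096-byte page, matching the page granularity that make_pages_readonly() and make_pages_writable() operate on. A quick check of the arithmetic:

    #include <stdio.h>

    #define LDT_ENTRY_SIZE 8        /* bytes per i386 descriptor */

    int main(void)
    {
            unsigned int want[] = { 1, 512, 513, 1000 };
            unsigned int i;

            for (i = 0; i < sizeof(want) / sizeof(want[0]); i++) {
                    unsigned int n = (want[i] + 511) & ~511u;

                    printf("mincount %4u -> %4u entries = %5u bytes (%u page(s))\n",
                           want[i], n, n * LDT_ENTRY_SIZE,
                           n * LDT_ENTRY_SIZE / 4096);
            }
            return 0;
    }
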
7146 +
7147 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
7148 +{
7149 +       int err = alloc_ldt(new, old->size, 0);
7150 +       if (err < 0)
7151 +               return err;
7152 +       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
7153 +       make_pages_readonly(
7154 +               new->ldt,
7155 +               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7156 +               XENFEAT_writable_descriptor_tables);
7157 +       return 0;
7158 +}
7159 +
7160 +/*
7161 + * we do not have to muck with descriptors here, that is
7162 + * done in switch_mm() as needed.
7163 + */
7164 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
7165 +{
7166 +       struct mm_struct * old_mm;
7167 +       int retval = 0;
7168 +
7169 +       init_MUTEX(&mm->context.sem);
7170 +       mm->context.size = 0;
7171 +       old_mm = current->mm;
7172 +       if (old_mm && old_mm->context.size > 0) {
7173 +               down(&old_mm->context.sem);
7174 +               retval = copy_ldt(&mm->context, &old_mm->context);
7175 +               up(&old_mm->context.sem);
7176 +       }
7177 +       return retval;
7178 +}
7179 +
7180 +/*
7181 + * No need to lock the MM as we are the last user
7182 + */
7183 +void destroy_context(struct mm_struct *mm)
7184 +{
7185 +       if (mm->context.size) {
7186 +               if (mm == current->active_mm)
7187 +                       clear_LDT();
7188 +               make_pages_writable(
7189 +                       mm->context.ldt,
7190 +                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7191 +                       XENFEAT_writable_descriptor_tables);
7192 +               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
7193 +                       vfree(mm->context.ldt);
7194 +               else
7195 +                       kfree(mm->context.ldt);
7196 +               mm->context.size = 0;
7197 +       }
7198 +}
7199 +
7200 +static int read_ldt(void __user * ptr, unsigned long bytecount)
7201 +{
7202 +       int err;
7203 +       unsigned long size;
7204 +       struct mm_struct * mm = current->mm;
7205 +
7206 +       if (!mm->context.size)
7207 +               return 0;
7208 +       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
7209 +               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
7210 +
7211 +       down(&mm->context.sem);
7212 +       size = mm->context.size*LDT_ENTRY_SIZE;
7213 +       if (size > bytecount)
7214 +               size = bytecount;
7215 +
7216 +       err = 0;
7217 +       if (copy_to_user(ptr, mm->context.ldt, size))
7218 +               err = -EFAULT;
7219 +       up(&mm->context.sem);
7220 +       if (err < 0)
7221 +               goto error_return;
7222 +       if (size != bytecount) {
7223 +               /* zero-fill the rest */
7224 +               if (clear_user(ptr+size, bytecount-size) != 0) {
7225 +                       err = -EFAULT;
7226 +                       goto error_return;
7227 +               }
7228 +       }
7229 +       return bytecount;
7230 +error_return:
7231 +       return err;
7232 +}
7233 +
7234 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
7235 +{
7236 +       int err;
7237 +       unsigned long size;
7238 +       void *address;
7239 +
7240 +       err = 0;
7241 +       address = &default_ldt[0];
7242 +       size = 5*sizeof(struct desc_struct);
7243 +       if (size > bytecount)
7244 +               size = bytecount;
7245 +
7246 +       err = size;
7247 +       if (copy_to_user(ptr, address, size))
7248 +               err = -EFAULT;
7249 +
7250 +       return err;
7251 +}
7252 +
7253 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
7254 +{
7255 +       struct mm_struct * mm = current->mm;
7256 +       __u32 entry_1, entry_2;
7257 +       int error;
7258 +       struct user_desc ldt_info;
7259 +
7260 +       error = -EINVAL;
7261 +       if (bytecount != sizeof(ldt_info))
7262 +               goto out;
7263 +       error = -EFAULT;        
7264 +       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
7265 +               goto out;
7266 +
7267 +       error = -EINVAL;
7268 +       if (ldt_info.entry_number >= LDT_ENTRIES)
7269 +               goto out;
7270 +       if (ldt_info.contents == 3) {
7271 +               if (oldmode)
7272 +                       goto out;
7273 +               if (ldt_info.seg_not_present == 0)
7274 +                       goto out;
7275 +       }
7276 +
7277 +       down(&mm->context.sem);
7278 +       if (ldt_info.entry_number >= mm->context.size) {
7279 +               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
7280 +               if (error < 0)
7281 +                       goto out_unlock;
7282 +       }
7283 +
7284 +       /* Allow LDTs to be cleared by the user. */
7285 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
7286 +               if (oldmode || LDT_empty(&ldt_info)) {
7287 +                       entry_1 = 0;
7288 +                       entry_2 = 0;
7289 +                       goto install;
7290 +               }
7291 +       }
7292 +
7293 +       entry_1 = LDT_entry_a(&ldt_info);
7294 +       entry_2 = LDT_entry_b(&ldt_info);
7295 +       if (oldmode)
7296 +               entry_2 &= ~(1 << 20);
7297 +
7298 +       /* Install the new entry ...  */
7299 +install:
7300 +       error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
7301 +                               entry_1, entry_2);
7302 +
7303 +out_unlock:
7304 +       up(&mm->context.sem);
7305 +out:
7306 +       return error;
7307 +}
7308 +
7309 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
7310 +{
7311 +       int ret = -ENOSYS;
7312 +
7313 +       switch (func) {
7314 +       case 0:
7315 +               ret = read_ldt(ptr, bytecount);
7316 +               break;
7317 +       case 1:
7318 +               ret = write_ldt(ptr, bytecount, 1);
7319 +               break;
7320 +       case 2:
7321 +               ret = read_default_ldt(ptr, bytecount);
7322 +               break;
7323 +       case 0x11:
7324 +               ret = write_ldt(ptr, bytecount, 0);
7325 +               break;
7326 +       }
7327 +       return ret;
7328 +}
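
glibc exports no wrapper for modify_ldt, so user space calls it via syscall(2). A hypothetical reader using func 0 (read_ldt() above returns the requested byte count, zero-filled past the table's end, or 0 when the task has no LDT):

    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            unsigned char buf[16 * 8];      /* room for 16 descriptors */
            long n;

            memset(buf, 0, sizeof(buf));
            n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));
            if (n < 0) {
                    perror("modify_ldt");
                    return 1;
            }
            printf("read %ld bytes of LDT data\n", n);
            return 0;
    }
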
7329 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/Makefile tmp-linux-2.6-xen.patch/arch/i386/kernel/Makefile
7330 --- ref-linux-2.6.16.9/arch/i386/kernel/Makefile        2006-04-19 08:10:14.000000000 +0200
7331 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/Makefile   2006-04-10 00:05:52.000000000 +0200
7332 @@ -37,17 +37,26 @@ obj-$(CONFIG_EFI)           += efi.o efi_stub.o
7333  obj-$(CONFIG_DOUBLEFAULT)      += doublefault.o
7334  obj-$(CONFIG_VM86)             += vm86.o
7335  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
7336 +obj-$(CONFIG_SMP_ALTERNATIVES) += smpalts.o
7337  
7338  EXTRA_AFLAGS   := -traditional
7339  
7340  obj-$(CONFIG_SCx200)           += scx200.o
7341  
7342 +ifdef CONFIG_XEN
7343 +vsyscall_note := vsyscall-note-xen.o
7344 +else
7345 +vsyscall_note := vsyscall-note.o
7346 +endif
7347 +
7348 +VSYSCALL_TYPES-y                       := int80
7349 +VSYSCALL_TYPES-$(CONFIG_X86_SYSENTER)  += sysenter
7350  # vsyscall.o contains the vsyscall DSO images as __initdata.
7351  # We must build both images before we can assemble it.
7352  # Note: kbuild does not track this dependency due to usage of .incbin
7353 -$(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so
7354 -targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
7355 -targets += vsyscall-note.o vsyscall.lds
7356 +$(obj)/vsyscall.o: $(foreach F,$(VSYSCALL_TYPES-y),$(obj)/vsyscall-$F.so)
7357 +targets += $(foreach F,$(VSYSCALL_TYPES-y),vsyscall-$F.o vsyscall-$F.so)
7358 +targets += $(vsyscall_note) vsyscall.lds
7359  
7360  # The DSO images are built using a special linker script.
7361  quiet_cmd_syscall = SYSCALL $@
7362 @@ -62,7 +71,7 @@ SYSCFLAGS_vsyscall-int80.so   = $(vsyscall
7363  
7364  $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
7365  $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
7366 -                     $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE
7367 +                     $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE
7368         $(call if_changed,syscall)
7369  
7370  # We also create a special relocatable object that should mirror the symbol
7371 @@ -74,5 +83,18 @@ $(obj)/built-in.o: ld_flags += -R $(obj)
7372  
7373  SYSCFLAGS_vsyscall-syms.o = -r
7374  $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
7375 -                       $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
7376 +                       $(foreach F,$(VSYSCALL_TYPES-y),$(obj)/vsyscall-$F.o) \
7377 +                       $(obj)/$(vsyscall_note) FORCE
7378         $(call if_changed,syscall)
7379 +
7380 +ifdef CONFIG_XEN
7381 +include $(srctree)/scripts/Makefile.xen
7382 +
7383 +obj-y += fixup.o
7384 +microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
7385 +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
7386 +
7387 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
7388 +obj-y := $(call cherrypickxen, $(obj-y))
7389 +extra-y := $(call cherrypickxen, $(extra-y))
7390 +endif
7391 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/microcode-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/microcode-xen.c
7392 --- ref-linux-2.6.16.9/arch/i386/kernel/microcode-xen.c 1970-01-01 01:00:00.000000000 +0100
7393 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/microcode-xen.c    2006-04-10 00:05:52.000000000 +0200
7394 @@ -0,0 +1,165 @@
7395 +/*
7396 + *     Intel CPU Microcode Update Driver for Linux
7397 + *
7398 + *     Copyright (C) 2000-2004 Tigran Aivazian
7399 + *
7400 + *     This driver allows upgrading microcode on Intel processors
7401 + *     belonging to the IA-32 family - PentiumPro, Pentium II,
7402 + *     Pentium III, Xeon, Pentium 4, etc.
7403 + *
7404 + *     Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
7405 + *     Order Number 245472 or free download from:
7406 + *             
7407 + *     http://developer.intel.com/design/pentium4/manuals/245472.htm
7408 + *
7409 + *     For more information, go to http://www.urbanmyth.org/microcode
7410 + *
7411 + *     This program is free software; you can redistribute it and/or
7412 + *     modify it under the terms of the GNU General Public License
7413 + *     as published by the Free Software Foundation; either version
7414 + *     2 of the License, or (at your option) any later version.
7415 + */
7416 +
7417 +//#define DEBUG /* pr_debug */
7418 +#include <linux/capability.h>
7419 +#include <linux/kernel.h>
7420 +#include <linux/init.h>
7421 +#include <linux/sched.h>
7422 +#include <linux/cpumask.h>
7423 +#include <linux/module.h>
7424 +#include <linux/slab.h>
7425 +#include <linux/vmalloc.h>
7426 +#include <linux/miscdevice.h>
7427 +#include <linux/spinlock.h>
7428 +#include <linux/mm.h>
7429 +#include <linux/syscalls.h>
7430 +
7431 +#include <asm/msr.h>
7432 +#include <asm/uaccess.h>
7433 +#include <asm/processor.h>
7434 +
7435 +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
7436 +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
7437 +MODULE_LICENSE("GPL");
7438 +
7439 +#define MICROCODE_VERSION      "1.14-xen"
7440 +
7441 +#define DEFAULT_UCODE_DATASIZE         (2000)    /* 2000 bytes */
7442 +#define MC_HEADER_SIZE         (sizeof (microcode_header_t))     /* 48 bytes */
7443 +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
7444 +
7445 +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
7446 +static DECLARE_MUTEX(microcode_sem);
7447 +
7448 +static void __user *user_buffer;       /* user area microcode data buffer */
7449 +static unsigned int user_buffer_size;  /* its size */
7450 +                               
7451 +static int microcode_open (struct inode *unused1, struct file *unused2)
7452 +{
7453 +       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
7454 +}
7455 +
7456 +
7457 +static int do_microcode_update (void)
7458 +{
7459 +       int err;
7460 +       dom0_op_t op;
7461 +
7462 +       err = sys_mlock((unsigned long)user_buffer, user_buffer_size);
7463 +       if (err != 0)
7464 +               return err;
7465 +
7466 +       op.cmd = DOM0_MICROCODE;
7467 +       op.u.microcode.data = user_buffer;
7468 +       op.u.microcode.length = user_buffer_size;
7469 +       err = HYPERVISOR_dom0_op(&op);
7470 +
7471 +       (void)sys_munlock((unsigned long)user_buffer, user_buffer_size);
7472 +
7473 +       return err;
7474 +}
7475 +
7476 +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
7477 +{
7478 +       ssize_t ret;
7479 +
7480 +       if (len < DEFAULT_UCODE_TOTALSIZE) {
7481 +               printk(KERN_ERR "microcode: not enough data\n"); 
7482 +               return -EINVAL;
7483 +       }
7484 +
7485 +       if ((len >> PAGE_SHIFT) > num_physpages) {
7486 +               printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
7487 +               return -EINVAL;
7488 +       }
7489 +
7490 +       down(&microcode_sem);
7491 +
7492 +       user_buffer = (void __user *) buf;
7493 +       user_buffer_size = (int) len;
7494 +
7495 +       ret = do_microcode_update();
7496 +       if (!ret)
7497 +               ret = (ssize_t)len;
7498 +
7499 +       up(&microcode_sem);
7500 +
7501 +       return ret;
7502 +}
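+/*
+ * A minimal userspace sketch of driving this interface (illustrative only;
+ * `image` and `image_len` are assumed to hold a complete microcode file of
+ * at least DEFAULT_UCODE_TOTALSIZE bytes, with error handling elided):
+ *
+ *     int fd = open("/dev/cpu/microcode", O_WRONLY);
+ *     if (fd >= 0 && write(fd, image, image_len) == (ssize_t)image_len)
+ *             puts("microcode update passed to the hypervisor");
+ *     close(fd);
+ */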
7503 +
7504 +static int microcode_ioctl (struct inode *inode, struct file *file, 
7505 +               unsigned int cmd, unsigned long arg)
7506 +{
7507 +       switch (cmd) {
7508 +               /* 
7509 +                *  XXX: will be removed after microcode_ctl 
7510 +                *  is updated to ignore failure of this ioctl()
7511 +                */
7512 +               case MICROCODE_IOCFREE:
7513 +                       return 0;
7514 +               default:
7515 +                       return -EINVAL;
7516 +       }
7517 +       return -EINVAL;
7518 +}
7519 +
7520 +static struct file_operations microcode_fops = {
7521 +       .owner          = THIS_MODULE,
7522 +       .write          = microcode_write,
7523 +       .ioctl          = microcode_ioctl,
7524 +       .open           = microcode_open,
7525 +};
7526 +
7527 +static struct miscdevice microcode_dev = {
7528 +       .minor          = MICROCODE_MINOR,
7529 +       .name           = "microcode",
7530 +       .devfs_name     = "cpu/microcode",
7531 +       .fops           = &microcode_fops,
7532 +};
7533 +
7534 +static int __init microcode_init (void)
7535 +{
7536 +       int error;
7537 +
7538 +       error = misc_register(&microcode_dev);
7539 +       if (error) {
7540 +               printk(KERN_ERR
7541 +                       "microcode: can't misc_register on minor=%d\n",
7542 +                       MICROCODE_MINOR);
7543 +               return error;
7544 +       }
7545 +
7546 +       printk(KERN_INFO 
7547 +               "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
7548 +       return 0;
7549 +}
7550 +
7551 +static void __exit microcode_exit (void)
7552 +{
7553 +       misc_deregister(&microcode_dev);
7554 +       printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION " unregistered\n");
7555 +}
7556 +
7557 +module_init(microcode_init)
7558 +module_exit(microcode_exit)
7559 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
7560 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/mpparse-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/mpparse-xen.c
7561 --- ref-linux-2.6.16.9/arch/i386/kernel/mpparse-xen.c   1970-01-01 01:00:00.000000000 +0100
7562 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/mpparse-xen.c      2006-04-10 00:05:52.000000000 +0200
7563 @@ -0,0 +1,1188 @@
7564 +/*
7565 + *     Intel Multiprocessor Specification 1.1 and 1.4
7566 + *     compliant MP-table parsing routines.
7567 + *
7568 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
7569 + *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7570 + *
7571 + *     Fixes
7572 + *             Erich Boleyn    :       MP v1.4 and additional changes.
7573 + *             Alan Cox        :       Added EBDA scanning
7574 + *             Ingo Molnar     :       various cleanups and rewrites
7575 + *             Maciej W. Rozycki:      Bits for default MP configurations
7576 + *             Paul Diefenbaugh:       Added full ACPI support
7577 + */
7578 +
7579 +#include <linux/mm.h>
7580 +#include <linux/init.h>
7581 +#include <linux/acpi.h>
7582 +#include <linux/delay.h>
7583 +#include <linux/config.h>
7584 +#include <linux/bootmem.h>
7585 +#include <linux/smp_lock.h>
7586 +#include <linux/kernel_stat.h>
7587 +#include <linux/mc146818rtc.h>
7588 +#include <linux/bitops.h>
7589 +
7590 +#include <asm/smp.h>
7591 +#include <asm/acpi.h>
7592 +#include <asm/mtrr.h>
7593 +#include <asm/mpspec.h>
7594 +#include <asm/io_apic.h>
7595 +
7596 +#include <mach_apic.h>
7597 +#include <mach_mpparse.h>
7598 +#include <bios_ebda.h>
7599 +
7600 +/* Have we found an MP table */
7601 +int smp_found_config;
7602 +unsigned int __initdata maxcpus = NR_CPUS;
7603 +
7604 +#ifdef CONFIG_HOTPLUG_CPU
7605 +#define CPU_HOTPLUG_ENABLED    (1)
7606 +#else
7607 +#define CPU_HOTPLUG_ENABLED    (0)
7608 +#endif
7609 +
7610 +/*
7611 + * Various Linux-internal data structures created from the
7612 + * MP-table.
7613 + */
7614 +int apic_version [MAX_APICS];
7615 +int mp_bus_id_to_type [MAX_MP_BUSSES];
7616 +int mp_bus_id_to_node [MAX_MP_BUSSES];
7617 +int mp_bus_id_to_local [MAX_MP_BUSSES];
7618 +int quad_local_to_mp_bus_id [NR_CPUS/4][4];
7619 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
7620 +static int mp_current_pci_id;
7621 +
7622 +/* I/O APIC entries */
7623 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7624 +
7625 +/* # of MP IRQ source entries */
7626 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7627 +
7628 +/* MP IRQ source entries */
7629 +int mp_irq_entries;
7630 +
7631 +int nr_ioapics;
7632 +
7633 +int pic_mode;
7634 +unsigned long mp_lapic_addr;
7635 +
7636 +unsigned int def_to_bigsmp = 0;
7637 +
7638 +/* Processor that is doing the boot up */
7639 +unsigned int boot_cpu_physical_apicid = -1U;
7640 +/* Internal processor count */
7641 +static unsigned int __devinitdata num_processors;
7642 +
7643 +/* Bitmask of physically existing CPUs */
7644 +physid_mask_t phys_cpu_present_map;
7645 +
7646 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
7647 +
7648 +/*
7649 + * Intel MP BIOS table parsing routines:
7650 + */
7651 +
7652 +
7653 +/*
7654 + * Checksum an MP configuration block.
7655 + */
7656 +
7657 +static int __init mpf_checksum(unsigned char *mp, int len)
7658 +{
7659 +       int sum = 0;
7660 +
7661 +       while (len--)
7662 +               sum += *mp++;
7663 +
7664 +       return sum & 0xFF;
7665 +}
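+/*
+ * By convention a well-formed MP structure is padded so that its bytes
+ * sum to zero modulo 256, so validity checks reduce to a sketch like the
+ * following (hypothetical buffer `tbl` of `len` bytes and hypothetical
+ * accept_table() helper; cf. the real use in smp_scan_config() below):
+ *
+ *     if (mpf_checksum(tbl, len) == 0)
+ *             accept_table(tbl);
+ */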
7666 +
7667 +/*
7668 + * Have to match translation table entries to main table entries by counter,
7669 + * hence the mpc_record variable... can't see a less disgusting way of
7670 + * doing this.
7671 + */
7672 +
7673 +static int mpc_record; 
7674 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
7675 +
7676 +#ifdef CONFIG_X86_NUMAQ
7677 +static int MP_valid_apicid(int apicid, int version)
7678 +{
7679 +       return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf;
7680 +}
7681 +#elif !defined(CONFIG_XEN)
7682 +static int MP_valid_apicid(int apicid, int version)
7683 +{
7684 +       if (version >= 0x14)
7685 +               return apicid < 0xff;
7686 +       else
7687 +               return apicid < 0xf;
7688 +}
7689 +#endif
7690 +
7691 +#ifndef CONFIG_XEN
7692 +static void __devinit MP_processor_info (struct mpc_config_processor *m)
7693 +{
7694 +       int ver, apicid;
7695 +       physid_mask_t phys_cpu;
7696 +       
7697 +       if (!(m->mpc_cpuflag & CPU_ENABLED))
7698 +               return;
7699 +
7700 +       apicid = mpc_apic_id(m, translation_table[mpc_record]);
7701 +
7702 +       if (m->mpc_featureflag&(1<<0))
7703 +               Dprintk("    Floating point unit present.\n");
7704 +       if (m->mpc_featureflag&(1<<7))
7705 +               Dprintk("    Machine Exception supported.\n");
7706 +       if (m->mpc_featureflag&(1<<8))
7707 +               Dprintk("    64 bit compare & exchange supported.\n");
7708 +       if (m->mpc_featureflag&(1<<9))
7709 +               Dprintk("    Internal APIC present.\n");
7710 +       if (m->mpc_featureflag&(1<<11))
7711 +               Dprintk("    SEP present.\n");
7712 +       if (m->mpc_featureflag&(1<<12))
7713 +               Dprintk("    MTRR  present.\n");
7714 +       if (m->mpc_featureflag&(1<<13))
7715 +               Dprintk("    PGE  present.\n");
7716 +       if (m->mpc_featureflag&(1<<14))
7717 +               Dprintk("    MCA  present.\n");
7718 +       if (m->mpc_featureflag&(1<<15))
7719 +               Dprintk("    CMOV  present.\n");
7720 +       if (m->mpc_featureflag&(1<<16))
7721 +               Dprintk("    PAT  present.\n");
7722 +       if (m->mpc_featureflag&(1<<17))
7723 +               Dprintk("    PSE  present.\n");
7724 +       if (m->mpc_featureflag&(1<<18))
7725 +               Dprintk("    PSN  present.\n");
7726 +       if (m->mpc_featureflag&(1<<19))
7727 +               Dprintk("    Cache Line Flush Instruction present.\n");
7728 +       /* 20 Reserved */
7729 +       if (m->mpc_featureflag&(1<<21))
7730 +               Dprintk("    Debug Trace and EMON Store present.\n");
7731 +       if (m->mpc_featureflag&(1<<22))
7732 +               Dprintk("    ACPI Thermal Throttle Registers  present.\n");
7733 +       if (m->mpc_featureflag&(1<<23))
7734 +               Dprintk("    MMX  present.\n");
7735 +       if (m->mpc_featureflag&(1<<24))
7736 +               Dprintk("    FXSR  present.\n");
7737 +       if (m->mpc_featureflag&(1<<25))
7738 +               Dprintk("    XMM  present.\n");
7739 +       if (m->mpc_featureflag&(1<<26))
7740 +               Dprintk("    Willamette New Instructions  present.\n");
7741 +       if (m->mpc_featureflag&(1<<27))
7742 +               Dprintk("    Self Snoop  present.\n");
7743 +       if (m->mpc_featureflag&(1<<28))
7744 +               Dprintk("    HT  present.\n");
7745 +       if (m->mpc_featureflag&(1<<29))
7746 +               Dprintk("    Thermal Monitor present.\n");
7747 +       /* 30, 31 Reserved */
7748 +
7749 +
7750 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
7751 +               Dprintk("    Bootup CPU\n");
7752 +               boot_cpu_physical_apicid = m->mpc_apicid;
7753 +       }
7754 +
7755 +       ver = m->mpc_apicver;
7756 +
7757 +       if (!MP_valid_apicid(apicid, ver)) {
7758 +               printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n",
7759 +                       m->mpc_apicid, MAX_APICS);
7760 +               return;
7761 +       }
7762 +
7763 +       /*
7764 +        * Validate version
7765 +        */
7766 +       if (ver == 0x0) {
7767 +               printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
7768 +                               "fixing up to 0x10. (tell your hw vendor)\n",
7769 +                               m->mpc_apicid);
7770 +               ver = 0x10;
7771 +       }
7772 +       apic_version[m->mpc_apicid] = ver;
7773 +
7774 +       phys_cpu = apicid_to_cpu_present(apicid);
7775 +       physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
7776 +
7777 +       if (num_processors >= NR_CPUS) {
7778 +               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
7779 +                       "  Processor ignored.\n", NR_CPUS);
7780 +               return;
7781 +       }
7782 +
7783 +       if (num_processors >= maxcpus) {
7784 +               printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
7785 +                       " Processor ignored.\n", maxcpus);
7786 +               return;
7787 +       }
7788 +
7789 +       cpu_set(num_processors, cpu_possible_map);
7790 +       num_processors++;
7791 +
7792 +       if (CPU_HOTPLUG_ENABLED || (num_processors > 8)) {
7793 +               switch (boot_cpu_data.x86_vendor) {
7794 +               case X86_VENDOR_INTEL:
7795 +                       if (!APIC_XAPIC(ver)) {
7796 +                               def_to_bigsmp = 0;
7797 +                               break;
7798 +                       }
7799 +                       /* If P4 and above fall through */
7800 +               case X86_VENDOR_AMD:
7801 +                       def_to_bigsmp = 1;
7802 +               }
7803 +       }
7804 +       bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
7805 +}
7806 +#else
7807 +void __init MP_processor_info (struct mpc_config_processor *m)
7808 +{
7809 +       num_processors++;
7810 +}
7811 +#endif /* CONFIG_XEN */
7812 +
7813 +static void __init MP_bus_info (struct mpc_config_bus *m)
7814 +{
7815 +       char str[7];
7816 +
7817 +       memcpy(str, m->mpc_bustype, 6);
7818 +       str[6] = 0;
7819 +
7820 +       mpc_oem_bus_info(m, str, translation_table[mpc_record]);
7821 +
7822 +       if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
7823 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
7824 +       } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
7825 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
7826 +       } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
7827 +               mpc_oem_pci_bus(m, translation_table[mpc_record]);
7828 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
7829 +               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
7830 +               mp_current_pci_id++;
7831 +       } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
7832 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
7833 +       } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
7834 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
7835 +       } else {
7836 +               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
7837 +       }
7838 +}
7839 +
7840 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
7841 +{
7842 +       if (!(m->mpc_flags & MPC_APIC_USABLE))
7843 +               return;
7844 +
7845 +       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
7846 +               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
7847 +       if (nr_ioapics >= MAX_IO_APICS) {
7848 +               printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
7849 +                       MAX_IO_APICS, nr_ioapics);
7850 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
7851 +       }
7852 +       if (!m->mpc_apicaddr) {
7853 +               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
7854 +                       " found in MP table, skipping!\n");
7855 +               return;
7856 +       }
7857 +       mp_ioapics[nr_ioapics] = *m;
7858 +       nr_ioapics++;
7859 +}
7860 +
7861 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
7862 +{
7863 +       mp_irqs [mp_irq_entries] = *m;
7864 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
7865 +               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
7866 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
7867 +                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
7868 +                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
7869 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
7870 +               panic("Max # of irq sources exceeded!!\n");
7871 +}
7872 +
7873 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
7874 +{
7875 +       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
7876 +               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
7877 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
7878 +                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
7879 +                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
7880 +       /*
7881 +        * Well it seems all SMP boards in existence
7882 +        * use ExtINT/LVT1 == LINT0 and
7883 +        * NMI/LVT2 == LINT1 - the following check
7884 +        * will show us if this assumption is false.
7885 +        * Until then we do not have to add baggage.
7886 +        */
7887 +       if ((m->mpc_irqtype == mp_ExtINT) &&
7888 +               (m->mpc_destapiclint != 0))
7889 +                       BUG();
7890 +       if ((m->mpc_irqtype == mp_NMI) &&
7891 +               (m->mpc_destapiclint != 1))
7892 +                       BUG();
7893 +}
7894 +
7895 +#ifdef CONFIG_X86_NUMAQ
7896 +static void __init MP_translation_info (struct mpc_config_translation *m)
7897 +{
7898 +       printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
7899 +
7900 +       if (mpc_record >= MAX_MPC_ENTRY) 
7901 +               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
7902 +       else
7903 +               translation_table[mpc_record] = m; /* stash this for later */
7904 +       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
7905 +               node_set_online(m->trans_quad);
7906 +}
7907 +
7908 +/*
7909 + * Read/parse the MPC oem tables
7910 + */
7911 +
7912 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
7913 +       unsigned short oemsize)
7914 +{
7915 +       int count = sizeof (*oemtable); /* the header size */
7916 +       unsigned char *oemptr = ((unsigned char *)oemtable)+count;
7917 +       
7918 +       mpc_record = 0;
7919 +       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
7920 +       if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
7921 +       {
7922 +               printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
7923 +                       oemtable->oem_signature[0],
7924 +                       oemtable->oem_signature[1],
7925 +                       oemtable->oem_signature[2],
7926 +                       oemtable->oem_signature[3]);
7927 +               return;
7928 +       }
7929 +       if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
7930 +       {
7931 +               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
7932 +               return;
7933 +       }
7934 +       while (count < oemtable->oem_length) {
7935 +               switch (*oemptr) {
7936 +                       case MP_TRANSLATION:
7937 +                       {
7938 +                               struct mpc_config_translation *m=
7939 +                                       (struct mpc_config_translation *)oemptr;
7940 +                               MP_translation_info(m);
7941 +                               oemptr += sizeof(*m);
7942 +                               count += sizeof(*m);
7943 +                               ++mpc_record;
7944 +                               break;
7945 +                       }
7946 +                       default:
7947 +                       {
7948 +                               printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
7949 +                               return;
7950 +                       }
7951 +               }
7952 +       }
7953 +}
7954 +
7955 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
7956 +               char *productid)
7957 +{
7958 +       if (strncmp(oem, "IBM NUMA", 8))
7959 +               printk("Warning!  May not be a NUMA-Q system!\n");
7960 +       if (mpc->mpc_oemptr)
7961 +               smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
7962 +                               mpc->mpc_oemsize);
7963 +}
7964 +#endif /* CONFIG_X86_NUMAQ */
7965 +
7966 +/*
7967 + * Read/parse the MPC
7968 + */
7969 +
7970 +static int __init smp_read_mpc(struct mp_config_table *mpc)
7971 +{
7972 +       char str[16];
7973 +       char oem[10];
7974 +       int count=sizeof(*mpc);
7975 +       unsigned char *mpt=((unsigned char *)mpc)+count;
7976 +
7977 +       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
7978 +               printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
7979 +                       *(u32 *)mpc->mpc_signature);
7980 +               return 0;
7981 +       }
7982 +       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
7983 +               printk(KERN_ERR "SMP mptable: checksum error!\n");
7984 +               return 0;
7985 +       }
7986 +       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
7987 +               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
7988 +                       mpc->mpc_spec);
7989 +               return 0;
7990 +       }
7991 +       if (!mpc->mpc_lapic) {
7992 +               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
7993 +               return 0;
7994 +       }
7995 +       memcpy(oem,mpc->mpc_oem,8);
7996 +       oem[8]=0;
7997 +       printk(KERN_INFO "OEM ID: %s ",oem);
7998 +
7999 +       memcpy(str,mpc->mpc_productid,12);
8000 +       str[12]=0;
8001 +       printk("Product ID: %s ",str);
8002 +
8003 +       mps_oem_check(mpc, oem, str);
8004 +
8005 +       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
8006 +
8007 +       /* 
8008 +        * Save the local APIC address (it might be non-default) -- but only
8009 +        * if we're not using ACPI.
8010 +        */
8011 +       if (!acpi_lapic)
8012 +               mp_lapic_addr = mpc->mpc_lapic;
8013 +
8014 +       /*
8015 +        *      Now process the configuration blocks.
8016 +        */
8017 +       mpc_record = 0;
8018 +       while (count < mpc->mpc_length) {
8019 +               switch(*mpt) {
8020 +                       case MP_PROCESSOR:
8021 +                       {
8022 +                               struct mpc_config_processor *m=
8023 +                                       (struct mpc_config_processor *)mpt;
8024 +                               /* ACPI may have already provided this data */
8025 +                               if (!acpi_lapic)
8026 +                                       MP_processor_info(m);
8027 +                               mpt += sizeof(*m);
8028 +                               count += sizeof(*m);
8029 +                               break;
8030 +                       }
8031 +                       case MP_BUS:
8032 +                       {
8033 +                               struct mpc_config_bus *m=
8034 +                                       (struct mpc_config_bus *)mpt;
8035 +                               MP_bus_info(m);
8036 +                               mpt += sizeof(*m);
8037 +                               count += sizeof(*m);
8038 +                               break;
8039 +                       }
8040 +                       case MP_IOAPIC:
8041 +                       {
8042 +                               struct mpc_config_ioapic *m=
8043 +                                       (struct mpc_config_ioapic *)mpt;
8044 +                               MP_ioapic_info(m);
8045 +                               mpt+=sizeof(*m);
8046 +                               count+=sizeof(*m);
8047 +                               break;
8048 +                       }
8049 +                       case MP_INTSRC:
8050 +                       {
8051 +                               struct mpc_config_intsrc *m=
8052 +                                       (struct mpc_config_intsrc *)mpt;
8053 +
8054 +                               MP_intsrc_info(m);
8055 +                               mpt+=sizeof(*m);
8056 +                               count+=sizeof(*m);
8057 +                               break;
8058 +                       }
8059 +                       case MP_LINTSRC:
8060 +                       {
8061 +                               struct mpc_config_lintsrc *m=
8062 +                                       (struct mpc_config_lintsrc *)mpt;
8063 +                               MP_lintsrc_info(m);
8064 +                               mpt+=sizeof(*m);
8065 +                               count+=sizeof(*m);
8066 +                               break;
8067 +                       }
8068 +                       default:
8069 +                       {
8070 +                               count = mpc->mpc_length;
8071 +                               break;
8072 +                       }
8073 +               }
8074 +               ++mpc_record;
8075 +       }
8076 +       clustered_apic_check();
8077 +       if (!num_processors)
8078 +               printk(KERN_ERR "SMP mptable: no processors registered!\n");
8079 +       return num_processors;
8080 +}
8081 +
8082 +static int __init ELCR_trigger(unsigned int irq)
8083 +{
8084 +       unsigned int port;
8085 +
8086 +       port = 0x4d0 + (irq >> 3);
8087 +       return (inb(port) >> (irq & 7)) & 1;
8088 +}
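+/*
+ * The two ELCR registers at 0x4d0 (IRQs 0-7) and 0x4d1 (IRQs 8-15) hold
+ * one trigger bit per ISA IRQ: set means level triggered, clear means
+ * edge. For example, IRQ 9 reads port 0x4d0 + (9 >> 3) = 0x4d1 and tests
+ * bit (9 & 7) = 1.
+ */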
8089 +
8090 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
8091 +{
8092 +       struct mpc_config_intsrc intsrc;
8093 +       int i;
8094 +       int ELCR_fallback = 0;
8095 +
8096 +       intsrc.mpc_type = MP_INTSRC;
8097 +       intsrc.mpc_irqflag = 0;                 /* conforming */
8098 +       intsrc.mpc_srcbus = 0;
8099 +       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
8100 +
8101 +       intsrc.mpc_irqtype = mp_INT;
8102 +
8103 +       /*
8104 +        *  If true, we have an ISA/PCI system with no IRQ entries
8105 +        *  in the MP table. To prevent the PCI interrupts from being set up
8106 +        *  incorrectly, we try to use the ELCR. The sanity check to see if
8107 +        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
8108 +        *  never be level sensitive, so we simply see if the ELCR agrees.
8109 +        *  If it does, we assume it's valid.
8110 +        */
8111 +       if (mpc_default_type == 5) {
8112 +               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
8113 +
8114 +               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
8115 +                       printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
8116 +               else {
8117 +                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
8118 +                       ELCR_fallback = 1;
8119 +               }
8120 +       }
8121 +
8122 +       for (i = 0; i < 16; i++) {
8123 +               switch (mpc_default_type) {
8124 +               case 2:
8125 +                       if (i == 0 || i == 13)
8126 +                               continue;       /* IRQ0 & IRQ13 not connected */
8127 +                       /* fall through */
8128 +               default:
8129 +                       if (i == 2)
8130 +                               continue;       /* IRQ2 is never connected */
8131 +               }
8132 +
8133 +               if (ELCR_fallback) {
8134 +                       /*
8135 +                        *  If the ELCR indicates a level-sensitive interrupt, we
8136 +                        *  copy that information over to the MP table in the
8137 +                        *  irqflag field (level sensitive, active high polarity).
8138 +                        */
8139 +                       if (ELCR_trigger(i))
8140 +                               intsrc.mpc_irqflag = 13;
8141 +                       else
8142 +                               intsrc.mpc_irqflag = 0;
8143 +               }
8144 +
8145 +               intsrc.mpc_srcbusirq = i;
8146 +               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
8147 +               MP_intsrc_info(&intsrc);
8148 +       }
8149 +
8150 +       intsrc.mpc_irqtype = mp_ExtINT;
8151 +       intsrc.mpc_srcbusirq = 0;
8152 +       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
8153 +       MP_intsrc_info(&intsrc);
8154 +}
8155 +
8156 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
8157 +{
8158 +       struct mpc_config_processor processor;
8159 +       struct mpc_config_bus bus;
8160 +       struct mpc_config_ioapic ioapic;
8161 +       struct mpc_config_lintsrc lintsrc;
8162 +       int linttypes[2] = { mp_ExtINT, mp_NMI };
8163 +       int i;
8164 +
8165 +       /*
8166 +        * local APIC has default address
8167 +        */
8168 +       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
8169 +
8170 +       /*
8171 +        * 2 CPUs, numbered 0 & 1.
8172 +        */
8173 +       processor.mpc_type = MP_PROCESSOR;
8174 +       /* Either an integrated APIC or a discrete 82489DX. */
8175 +       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8176 +       processor.mpc_cpuflag = CPU_ENABLED;
8177 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
8178 +                                  (boot_cpu_data.x86_model << 4) |
8179 +                                  boot_cpu_data.x86_mask;
8180 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
8181 +       processor.mpc_reserved[0] = 0;
8182 +       processor.mpc_reserved[1] = 0;
8183 +       for (i = 0; i < 2; i++) {
8184 +               processor.mpc_apicid = i;
8185 +               MP_processor_info(&processor);
8186 +       }
8187 +
8188 +       bus.mpc_type = MP_BUS;
8189 +       bus.mpc_busid = 0;
8190 +       switch (mpc_default_type) {
8191 +               default:
8192 +                       printk("???\n");
8193 +                       printk(KERN_ERR "Unknown standard configuration %d\n",
8194 +                               mpc_default_type);
8195 +                       /* fall through */
8196 +               case 1:
8197 +               case 5:
8198 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
8199 +                       break;
8200 +               case 2:
8201 +               case 6:
8202 +               case 3:
8203 +                       memcpy(bus.mpc_bustype, "EISA  ", 6);
8204 +                       break;
8205 +               case 4:
8206 +               case 7:
8207 +                       memcpy(bus.mpc_bustype, "MCA   ", 6);
8208 +       }
8209 +       MP_bus_info(&bus);
8210 +       if (mpc_default_type > 4) {
8211 +               bus.mpc_busid = 1;
8212 +               memcpy(bus.mpc_bustype, "PCI   ", 6);
8213 +               MP_bus_info(&bus);
8214 +       }
8215 +
8216 +       ioapic.mpc_type = MP_IOAPIC;
8217 +       ioapic.mpc_apicid = 2;
8218 +       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8219 +       ioapic.mpc_flags = MPC_APIC_USABLE;
8220 +       ioapic.mpc_apicaddr = 0xFEC00000;
8221 +       MP_ioapic_info(&ioapic);
8222 +
8223 +       /*
8224 +        * We set up most of the low 16 IO-APIC pins according to MPS rules.
8225 +        */
8226 +       construct_default_ioirq_mptable(mpc_default_type);
8227 +
8228 +       lintsrc.mpc_type = MP_LINTSRC;
8229 +       lintsrc.mpc_irqflag = 0;                /* conforming */
8230 +       lintsrc.mpc_srcbusid = 0;
8231 +       lintsrc.mpc_srcbusirq = 0;
8232 +       lintsrc.mpc_destapic = MP_APIC_ALL;
8233 +       for (i = 0; i < 2; i++) {
8234 +               lintsrc.mpc_irqtype = linttypes[i];
8235 +               lintsrc.mpc_destapiclint = i;
8236 +               MP_lintsrc_info(&lintsrc);
8237 +       }
8238 +}
8239 +
8240 +static struct intel_mp_floating *mpf_found;
8241 +
8242 +/*
8243 + * Scan the memory blocks for an SMP configuration block.
8244 + */
8245 +void __init get_smp_config (void)
8246 +{
8247 +       struct intel_mp_floating *mpf = mpf_found;
8248 +
8249 +       /*
8250 +        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
8251 +        * processors, where MPS only supports physical.
8252 +        */
8253 +       if (acpi_lapic && acpi_ioapic) {
8254 +               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
8255 +               return;
8256 +       }
8257 +       else if (acpi_lapic)
8258 +               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
8259 +
8260 +       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
8261 +       if (mpf->mpf_feature2 & (1<<7)) {
8262 +               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
8263 +               pic_mode = 1;
8264 +       } else {
8265 +               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
8266 +               pic_mode = 0;
8267 +       }
8268 +
8269 +       /*
8270 +        * Now see if we need to read further.
8271 +        */
8272 +       if (mpf->mpf_feature1 != 0) {
8273 +
8274 +               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
8275 +               construct_default_ISA_mptable(mpf->mpf_feature1);
8276 +
8277 +       } else if (mpf->mpf_physptr) {
8278 +
8279 +               /*
8280 +                * Read the physical hardware table.  Anything here will
8281 +                * override the defaults.
8282 +                */
8283 +               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
8284 +                       smp_found_config = 0;
8285 +                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
8286 +                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
8287 +                       return;
8288 +               }
8289 +               /*
8290 +                * If there are no explicit MP IRQ entries, then we are
8291 +                * broken.  We set up most of the low 16 IO-APIC pins to
8292 +                * ISA defaults and hope it will work.
8293 +                */
8294 +               if (!mp_irq_entries) {
8295 +                       struct mpc_config_bus bus;
8296 +
8297 +                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
8298 +
8299 +                       bus.mpc_type = MP_BUS;
8300 +                       bus.mpc_busid = 0;
8301 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
8302 +                       MP_bus_info(&bus);
8303 +
8304 +                       construct_default_ioirq_mptable(0);
8305 +               }
8306 +
8307 +       } else
8308 +               BUG();
8309 +
8310 +       printk(KERN_INFO "Processors: %d\n", num_processors);
8311 +       /*
8312 +        * Only use the first configuration found.
8313 +        */
8314 +}
8315 +
8316 +static int __init smp_scan_config (unsigned long base, unsigned long length)
8317 +{
8318 +       unsigned long *bp = isa_bus_to_virt(base);
8319 +       struct intel_mp_floating *mpf;
8320 +
8321 +       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
8322 +       if (sizeof(*mpf) != 16)
8323 +               printk("Error: MPF size\n");
8324 +
8325 +       while (length > 0) {
8326 +               mpf = (struct intel_mp_floating *)bp;
8327 +               if ((*bp == SMP_MAGIC_IDENT) &&
8328 +                       (mpf->mpf_length == 1) &&
8329 +                       !mpf_checksum((unsigned char *)bp, 16) &&
8330 +                       ((mpf->mpf_specification == 1)
8331 +                               || (mpf->mpf_specification == 4)) ) {
8332 +
8333 +                       smp_found_config = 1;
8334 +#ifndef CONFIG_XEN
8335 +                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
8336 +                                               virt_to_phys(mpf));
8337 +                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
8338 +                       if (mpf->mpf_physptr) {
8339 +                               /*
8340 +                                * We cannot access the MPC table to compute its
8341 +                                * size yet, as only the first few megabytes of
8342 +                                * memory are mapped at this point.
8343 +                                * The PC-9800 places its MPC table at the very end
8344 +                                * of physical memory, so simply reserving PAGE_SIZE
8345 +                                * from mpf->mpf_physptr could trip the BUG() in
8346 +                                * reserve_bootmem.
8347 +                                */
8348 +                               unsigned long size = PAGE_SIZE;
8349 +                               unsigned long end = max_low_pfn * PAGE_SIZE;
8350 +                               if (mpf->mpf_physptr + size > end)
8351 +                                       size = end - mpf->mpf_physptr;
8352 +                               reserve_bootmem(mpf->mpf_physptr, size);
8353 +                       }
8354 +#else
8355 +                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
8356 +                               ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
8357 +#endif
8358 +
8359 +                       mpf_found = mpf;
8360 +                       return 1;
8361 +               }
8362 +               bp += 4;
8363 +               length -= 16;
8364 +       }
8365 +       return 0;
8366 +}
8367 +
8368 +void __init find_smp_config (void)
8369 +{
8370 +#ifndef CONFIG_XEN
8371 +       unsigned int address;
8372 +#endif
8373 +
8374 +       /*
8375 +        * FIXME: Linux assumes you have 640K of base RAM;
8376 +        * this continues that error...
8377 +        *
8378 +        * 1) Scan the bottom 1K for a signature
8379 +        * 2) Scan the top 1K of base RAM
8380 +        * 3) Scan the 64K of BIOS
8381 +        */
8382 +       if (smp_scan_config(0x0,0x400) ||
8383 +               smp_scan_config(639*0x400,0x400) ||
8384 +                       smp_scan_config(0xF0000,0x10000))
8385 +               return;
8386 +       /*
8387 +        * If it is an SMP machine we should know now, unless the
8388 +        * configuration is in an EISA/MCA bus machine with an
8389 +        * extended bios data area.
8390 +        *
8391 +        * There is a real-mode segmented pointer to the 4K EBDA
8392 +        * area at 0x40E; calculate its address and scan it here.
8393 +        *
8394 +        * NOTE! There are Linux loaders that will corrupt the EBDA
8395 +        * area, and as such this kind of SMP config may be less
8396 +        * trustworthy, simply because the SMP table may have been
8397 +        * stomped on during early boot. These loaders are buggy and
8398 +        * should be fixed.
8399 +        *
8400 +        * The MP 1.4 spec says to scan only the first 1K of the 4K EBDA.
8401 +        */
8402 +
8403 +#ifndef CONFIG_XEN
8404 +       address = get_bios_ebda();
8405 +       if (address)
8406 +               smp_scan_config(address, 0x400);
8407 +#endif
8408 +}
8409 +
8410 +/* --------------------------------------------------------------------------
8411 +                            ACPI-based MP Configuration
8412 +   -------------------------------------------------------------------------- */
8413 +
8414 +#ifdef CONFIG_ACPI
8415 +
8416 +void __init mp_register_lapic_address (
8417 +       u64                     address)
8418 +{
8419 +#ifndef CONFIG_XEN
8420 +       mp_lapic_addr = (unsigned long) address;
8421 +
8422 +       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
8423 +
8424 +       if (boot_cpu_physical_apicid == -1U)
8425 +               boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
8426 +
8427 +       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
8428 +#endif
8429 +}
8430 +
8431 +
8432 +void __devinit mp_register_lapic (
8433 +       u8                      id, 
8434 +       u8                      enabled)
8435 +{
8436 +       struct mpc_config_processor processor;
8437 +       int                     boot_cpu = 0;
8438 +       
8439 +       if (MAX_APICS - id <= 0) {
8440 +               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
8441 +                       id, MAX_APICS);
8442 +               return;
8443 +       }
8444 +
8445 +       if (id == boot_cpu_physical_apicid)
8446 +               boot_cpu = 1;
8447 +
8448 +#ifndef CONFIG_XEN
8449 +       processor.mpc_type = MP_PROCESSOR;
8450 +       processor.mpc_apicid = id;
8451 +       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
8452 +       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
8453 +       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
8454 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
8455 +               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
8456 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
8457 +       processor.mpc_reserved[0] = 0;
8458 +       processor.mpc_reserved[1] = 0;
8459 +#endif
8460 +
8461 +       MP_processor_info(&processor);
8462 +}
8463 +
8464 +#ifdef CONFIG_X86_IO_APIC
8465 +
8466 +#define MP_ISA_BUS             0
8467 +#define MP_MAX_IOAPIC_PIN      127
8468 +
8469 +static struct mp_ioapic_routing {
8470 +       int                     apic_id;
8471 +       int                     gsi_base;
8472 +       int                     gsi_end;
8473 +       u32                     pin_programmed[4];
8474 +} mp_ioapic_routing[MAX_IO_APICS];
8475 +
8476 +
8477 +static int mp_find_ioapic (
8478 +       int                     gsi)
8479 +{
8480 +       int                     i = 0;
8481 +
8482 +       /* Find the IOAPIC that manages this GSI. */
8483 +       for (i = 0; i < nr_ioapics; i++) {
8484 +               if ((gsi >= mp_ioapic_routing[i].gsi_base)
8485 +                       && (gsi <= mp_ioapic_routing[i].gsi_end))
8486 +                       return i;
8487 +       }
8488 +
8489 +       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
8490 +
8491 +       return -1;
8492 +}
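+/*
+ * An illustrative lookup, assuming a single I/O APIC routing GSI 0-23
+ * (gsi_base = 0, gsi_end = 23): mp_find_ioapic(16) returns index 0 and
+ * the pin is then recovered as gsi - gsi_base = 16, while GSI 24 misses
+ * every range and yields -1.
+ */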
8493 +       
8494 +
8495 +void __init mp_register_ioapic (
8496 +       u8                      id, 
8497 +       u32                     address,
8498 +       u32                     gsi_base)
8499 +{
8500 +       int                     idx = 0;
8501 +       int                     tmpid;
8502 +
8503 +       if (nr_ioapics >= MAX_IO_APICS) {
8504 +               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
8505 +                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
8506 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
8507 +       }
8508 +       if (!address) {
8509 +               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
8510 +                       " found in MADT table, skipping!\n");
8511 +               return;
8512 +       }
8513 +
8514 +       idx = nr_ioapics++;
8515 +
8516 +       mp_ioapics[idx].mpc_type = MP_IOAPIC;
8517 +       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
8518 +       mp_ioapics[idx].mpc_apicaddr = address;
8519 +
8520 +#ifndef CONFIG_XEN
8521 +       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
8522 +#endif
8523 +       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
8524 +               tmpid = io_apic_get_unique_id(idx, id);
8525 +       else
8526 +               tmpid = id;
8527 +       if (tmpid == -1) {
8528 +               nr_ioapics--;
8529 +               return;
8530 +       }
8531 +       mp_ioapics[idx].mpc_apicid = tmpid;
8532 +       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
8533 +       
8534 +       /* 
8535 +        * Build basic GSI lookup table to facilitate gsi->io_apic lookups
8536 +        * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
8537 +        */
8538 +       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
8539 +       mp_ioapic_routing[idx].gsi_base = gsi_base;
8540 +       mp_ioapic_routing[idx].gsi_end = gsi_base + 
8541 +               io_apic_get_redir_entries(idx);
8542 +
8543 +       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
8544 +               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
8545 +               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
8546 +               mp_ioapic_routing[idx].gsi_base,
8547 +               mp_ioapic_routing[idx].gsi_end);
8548 +
8549 +       return;
8550 +}
8551 +
8552 +
8553 +void __init mp_override_legacy_irq (
8554 +       u8                      bus_irq,
8555 +       u8                      polarity, 
8556 +       u8                      trigger, 
8557 +       u32                     gsi)
8558 +{
8559 +       struct mpc_config_intsrc intsrc;
8560 +       int                     ioapic = -1;
8561 +       int                     pin = -1;
8562 +
8563 +       /* 
8564 +        * Convert 'gsi' to 'ioapic.pin'.
8565 +        */
8566 +       ioapic = mp_find_ioapic(gsi);
8567 +       if (ioapic < 0)
8568 +               return;
8569 +       pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
8570 +
8571 +       /*
8572 +        * TBD: This check is for faulty timer entries, where the override
8573 +        *      erroneously sets the trigger to level, resulting in a HUGE 
8574 +        *      increase of timer interrupts!
8575 +        */
8576 +       if ((bus_irq == 0) && (trigger == 3))
8577 +               trigger = 1;
8578 +
8579 +       intsrc.mpc_type = MP_INTSRC;
8580 +       intsrc.mpc_irqtype = mp_INT;
8581 +       intsrc.mpc_irqflag = (trigger << 2) | polarity;
8582 +       intsrc.mpc_srcbus = MP_ISA_BUS;
8583 +       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
8584 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
8585 +       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
8586 +
8587 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
8588 +               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
8589 +               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
8590 +               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
8591 +
8592 +       mp_irqs[mp_irq_entries] = intsrc;
8593 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
8594 +               panic("Max # of irq sources exceeded!\n");
8595 +
8596 +       return;
8597 +}
8598 +
8599 +int es7000_plat;
8600 +
8601 +void __init mp_config_acpi_legacy_irqs (void)
8602 +{
8603 +       struct mpc_config_intsrc intsrc;
8604 +       int                     i = 0;
8605 +       int                     ioapic = -1;
8606 +
8607 +       /* 
8608 +        * Fabricate the legacy ISA bus (bus MP_ISA_BUS).
8609 +        */
8610 +       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
8611 +       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
8612 +
8613 +       /*
8614 +        * Older generations of ES7000 have no legacy identity mappings
8615 +        */
8616 +       if (es7000_plat == 1)
8617 +               return;
8618 +
8619 +       /* 
8620 +        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
8621 +        */
8622 +       ioapic = mp_find_ioapic(0);
8623 +       if (ioapic < 0)
8624 +               return;
8625 +
8626 +       intsrc.mpc_type = MP_INTSRC;
8627 +       intsrc.mpc_irqflag = 0;                                 /* Conforming */
8628 +       intsrc.mpc_srcbus = MP_ISA_BUS;
8629 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
8630 +
8631 +       /* 
8632 +        * Use the default configuration for IRQs 0-15, unless
8633 +        * overridden by (MADT) interrupt source override entries.
8634 +        */
8635 +       for (i = 0; i < 16; i++) {
8636 +               int idx;
8637 +
8638 +               for (idx = 0; idx < mp_irq_entries; idx++) {
8639 +                       struct mpc_config_intsrc *irq = mp_irqs + idx;
8640 +
8641 +                       /* Do we already have a mapping for this ISA IRQ? */
8642 +                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
8643 +                               break;
8644 +
8645 +                       /* Do we already have a mapping for this IOAPIC pin */
8646 +                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
8647 +                               (irq->mpc_dstirq == i))
8648 +                               break;
8649 +               }
8650 +
8651 +               if (idx != mp_irq_entries) {
8652 +                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
8653 +                       continue;                       /* IRQ already used */
8654 +               }
8655 +
8656 +               intsrc.mpc_irqtype = mp_INT;
8657 +               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
8658 +               intsrc.mpc_dstirq = i;
8659 +
8660 +               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
8661 +                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
8662 +                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
8663 +                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
8664 +                       intsrc.mpc_dstirq);
8665 +
8666 +               mp_irqs[mp_irq_entries] = intsrc;
8667 +               if (++mp_irq_entries == MAX_IRQ_SOURCES)
8668 +                       panic("Max # of irq sources exceeded!\n");
8669 +       }
8670 +}
8671 +
8672 +#define MAX_GSI_NUM    4096
8673 +
8674 +int mp_register_gsi (u32 gsi, int triggering, int polarity)
8675 +{
8676 +       int                     ioapic = -1;
8677 +       int                     ioapic_pin = 0;
8678 +       int                     idx, bit = 0;
8679 +       static int              pci_irq = 16;
8680 +       /*
8681 +        * Mapping between Global System Interrupts, which
8682 +        * represent all possible interrupts, and IRQs
8683 +        * assigned to actual devices.
8684 +        */
8685 +       static int              gsi_to_irq[MAX_GSI_NUM];
8686 +
8687 +       /* Don't set up the ACPI SCI because it's already set up */
8688 +       if (acpi_fadt.sci_int == gsi)
8689 +               return gsi;
8690 +
8691 +       ioapic = mp_find_ioapic(gsi);
8692 +       if (ioapic < 0) {
8693 +               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
8694 +               return gsi;
8695 +       }
8696 +
8697 +       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
8698 +
8699 +       if (ioapic_renumber_irq)
8700 +               gsi = ioapic_renumber_irq(ioapic, gsi);
8701 +
8702 +       /* 
8703 +        * Avoid pin reprogramming.  PRTs typically include entries  
8704 +        * with redundant pin->gsi mappings (but unique PCI devices);
8705 +        * we only program the IOAPIC on the first.
8706 +        */
8707 +       bit = ioapic_pin % 32;
8708 +       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
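+       /*
+        * e.g. ioapic_pin 35 lands in pin_programmed[1] (35 / 32 = 1)
+        * at bit 3 (35 % 32); pins 0-31 always map to word 0.
+        */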
8709 +       if (idx > 3) {
8710 +               printk(KERN_ERR "Invalid reference to IOAPIC pin "
8711 +                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
8712 +                       ioapic_pin);
8713 +               return gsi;
8714 +       }
8715 +       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
8716 +               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
8717 +                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
8718 +               return gsi_to_irq[gsi];
8719 +       }
8720 +
8721 +       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
8722 +
8723 +       if (triggering == ACPI_LEVEL_SENSITIVE) {
8724 +               /*
8725 +                * For PCI devices assign IRQs in order, avoiding gaps
8726 +                * due to unused I/O APIC pins.
8727 +                */
8728 +               int irq = gsi;
8729 +               if (gsi < MAX_GSI_NUM) {
8730 +                       if (gsi > 15)
8731 +                               gsi = pci_irq++;
8732 +                       /*
8733 +                        * Don't assign IRQ used by ACPI SCI
8734 +                        */
8735 +                       if (gsi == acpi_fadt.sci_int)
8736 +                               gsi = pci_irq++;
8737 +                       gsi_to_irq[irq] = gsi;
8738 +               } else {
8739 +                       printk(KERN_ERR "GSI %u is too high\n", gsi);
8740 +                       return gsi;
8741 +               }
8742 +       }
8743 +
8744 +       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
8745 +                   triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
8746 +                   polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
8747 +       return gsi;
8748 +}
8749 +
8750 +#endif /* CONFIG_X86_IO_APIC */
8751 +#endif /* CONFIG_ACPI */
8752 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/pci-dma-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/pci-dma-xen.c
8753 --- ref-linux-2.6.16.9/arch/i386/kernel/pci-dma-xen.c   1970-01-01 01:00:00.000000000 +0100
8754 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/pci-dma-xen.c      2006-04-10 00:05:52.000000000 +0200
8755 @@ -0,0 +1,344 @@
8756 +/*
8757 + * Dynamic DMA mapping support.
8758 + *
8759 + * On i386 there is no hardware dynamic DMA address translation,
8760 + * so consistent alloc/free are merely page allocation/freeing.
8761 + * The rest of the dynamic DMA mapping interface is implemented
8762 + * in asm/pci.h.
8763 + */
8764 +
8765 +#include <linux/types.h>
8766 +#include <linux/mm.h>
8767 +#include <linux/string.h>
8768 +#include <linux/pci.h>
8769 +#include <linux/module.h>
8770 +#include <linux/version.h>
8771 +#include <asm/io.h>
8772 +#include <xen/balloon.h>
8773 +#include <asm/tlbflush.h>
8774 +#include <asm-i386/mach-xen/asm/swiotlb.h>
8775 +#include <asm/bug.h>
8776 +
8777 +#ifdef __x86_64__
8778 +int iommu_merge __read_mostly = 0;
8779 +EXPORT_SYMBOL(iommu_merge);
8780 +
8781 +dma_addr_t bad_dma_address __read_mostly;
8782 +EXPORT_SYMBOL(bad_dma_address);
8783 +
8784 +/* This tells the BIO block layer to assume merging. Default to off
8785 +   because we cannot guarantee merging later. */
8786 +int iommu_bio_merge __read_mostly = 0;
8787 +EXPORT_SYMBOL(iommu_bio_merge);
8788 +
8789 +__init int iommu_setup(char *p)
8790 +{
8791 +    return 1;
8792 +}
8793 +#endif
8794 +
8795 +struct dma_coherent_mem {
8796 +       void            *virt_base;
8797 +       u32             device_base;
8798 +       int             size;
8799 +       int             flags;
8800 +       unsigned long   *bitmap;
8801 +};
8802 +
8803 +#define IOMMU_BUG_ON(test)                             \
8804 +do {                                                   \
8805 +       if (unlikely(test)) {                           \
8806 +               printk(KERN_ALERT "Fatal DMA error! "   \
8807 +                      "Please use 'swiotlb=force'\n"); \
8808 +               BUG();                                  \
8809 +       }                                               \
8810 +} while (0)
8811 +
8812 +int
8813 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8814 +          enum dma_data_direction direction)
8815 +{
8816 +       int i, rc;
8817 +
8818 +       if (direction == DMA_NONE)
8819 +               BUG();
8820 +       WARN_ON(nents == 0 || sg[0].length == 0);
8821 +
8822 +       if (swiotlb) {
8823 +               rc = swiotlb_map_sg(hwdev, sg, nents, direction);
8824 +       } else {
8825 +               for (i = 0; i < nents; i++ ) {
8826 +                       sg[i].dma_address =
8827 +                               page_to_phys(sg[i].page) + sg[i].offset;
8828 +                       sg[i].dma_length  = sg[i].length;
8829 +                       BUG_ON(!sg[i].page);
8830 +                       IOMMU_BUG_ON(address_needs_mapping(
8831 +                               hwdev, sg[i].dma_address));
8832 +               }
8833 +               rc = nents;
8834 +       }
8835 +
8836 +       flush_write_buffers();
8837 +       return rc;
8838 +}
8839 +EXPORT_SYMBOL(dma_map_sg);
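+/*
+ * A driver-side sketch of the map/unmap pairing (assumes `sg` is an
+ * initialised scatterlist of `nents` entries for device `dev`, and
+ * program_hw() stands in for device-specific setup; not part of the
+ * original patch):
+ *
+ *     int i, n = dma_map_sg(dev, sg, nents, DMA_TO_DEVICE);
+ *     for (i = 0; i < n; i++)
+ *             program_hw(sg[i].dma_address, sg[i].dma_length);
+ *     dma_unmap_sg(dev, sg, nents, DMA_TO_DEVICE);
+ */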
8840 +
8841 +void
8842 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8843 +            enum dma_data_direction direction)
8844 +{
8845 +       BUG_ON(direction == DMA_NONE);
8846 +       if (swiotlb)
8847 +               swiotlb_unmap_sg(hwdev, sg, nents, direction);
8848 +}
8849 +EXPORT_SYMBOL(dma_unmap_sg);
8850 +
8851 +dma_addr_t
8852 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
8853 +            size_t size, enum dma_data_direction direction)
8854 +{
8855 +       dma_addr_t dma_addr;
8856 +
8857 +       BUG_ON(direction == DMA_NONE);
8858 +
8859 +       if (swiotlb) {
8860 +               dma_addr = swiotlb_map_page(
8861 +                       dev, page, offset, size, direction);
8862 +       } else {
8863 +               dma_addr = page_to_phys(page) + offset;
8864 +               IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
8865 +       }
8866 +
8867 +       return dma_addr;
8868 +}
8869 +EXPORT_SYMBOL(dma_map_page);
8870 +
8871 +void
8872 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
8873 +              enum dma_data_direction direction)
8874 +{
8875 +       BUG_ON(direction == DMA_NONE);
8876 +       if (swiotlb)
8877 +               swiotlb_unmap_page(dev, dma_address, size, direction);
8878 +}
8879 +EXPORT_SYMBOL(dma_unmap_page);
8880 +
8881 +int
8882 +dma_mapping_error(dma_addr_t dma_addr)
8883 +{
8884 +       if (swiotlb)
8885 +               return swiotlb_dma_mapping_error(dma_addr);
8886 +       return 0;
8887 +}
8888 +EXPORT_SYMBOL(dma_mapping_error);
8889 +
8890 +int
8891 +dma_supported(struct device *dev, u64 mask)
8892 +{
8893 +       if (swiotlb)
8894 +               return swiotlb_dma_supported(dev, mask);
8895 +       /*
8896 +        * By default we'll BUG when an infeasible DMA is requested, and
8897 +        * request swiotlb=force (see IOMMU_BUG_ON).
8898 +        */
8899 +       return 1;
8900 +}
8901 +EXPORT_SYMBOL(dma_supported);
8902 +
8903 +void *dma_alloc_coherent(struct device *dev, size_t size,
8904 +                          dma_addr_t *dma_handle, gfp_t gfp)
8905 +{
8906 +       void *ret;
8907 +       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8908 +       unsigned int order = get_order(size);
8909 +       unsigned long vstart;
8910 +       /* ignore region specifiers */
8911 +       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
8912 +
8913 +       if (mem) {
8914 +               int page = bitmap_find_free_region(mem->bitmap, mem->size,
8915 +                                                    order);
8916 +               if (page >= 0) {
8917 +                       *dma_handle = mem->device_base + (page << PAGE_SHIFT);
8918 +                       ret = mem->virt_base + (page << PAGE_SHIFT);
8919 +                       memset(ret, 0, size);
8920 +                       return ret;
8921 +               }
8922 +               if (mem->flags & DMA_MEMORY_EXCLUSIVE)
8923 +                       return NULL;
8924 +       }
8925 +
8926 +       if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
8927 +               gfp |= GFP_DMA;
8928 +
8929 +       vstart = __get_free_pages(gfp, order);
8930 +       ret = (void *)vstart;
8931 +
8932 +       if (ret != NULL) {
8933 +               /* NB. Hardcode 31 address bits for now: aacraid limitation. */
8934 +               if (xen_create_contiguous_region(vstart, order, 31) != 0) {
8935 +                       free_pages(vstart, order);
8936 +                       return NULL;
8937 +               }
8938 +               memset(ret, 0, size);
8939 +               *dma_handle = virt_to_bus(ret);
8940 +       }
8941 +       return ret;
8942 +}
8943 +EXPORT_SYMBOL(dma_alloc_coherent);
8944 +
8945 +void dma_free_coherent(struct device *dev, size_t size,
8946 +                        void *vaddr, dma_addr_t dma_handle)
8947 +{
8948 +       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8949 +       int order = get_order(size);
8950 +       
8951 +       if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
8952 +               int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
8953 +
8954 +               bitmap_release_region(mem->bitmap, page, order);
8955 +       } else {
8956 +               xen_destroy_contiguous_region((unsigned long)vaddr, order);
8957 +               free_pages((unsigned long)vaddr, order);
8958 +       }
8959 +}
8960 +EXPORT_SYMBOL(dma_free_coherent);
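
Because xen_create_contiguous_region() swaps the freshly allocated pages for machine-contiguous frames (currently forced below 31 address bits, per the aacraid note above), the buffer can be handed to hardware as-is. A hedged usage sketch; the PCI device, I/O base and register offset are illustrative:

        /* Hypothetical: a 4 KB descriptor ring for a NIC. */
        dma_addr_t ring_bus;
        void *ring;

        ring = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &ring_bus, GFP_KERNEL);
        if (ring == NULL)
                return -ENOMEM;
        writel(ring_bus, ioaddr + RING_BASE);   /* RING_BASE is illustrative */
        /* ... device lifetime ... */
        dma_free_coherent(&pdev->dev, PAGE_SIZE, ring, ring_bus);
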
8961 +
8962 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
8963 +                               dma_addr_t device_addr, size_t size, int flags)
8964 +{
8965 +       void __iomem *mem_base;
8966 +       int pages = size >> PAGE_SHIFT;
8967 +       int bitmap_size = (pages + 31)/32;
8968 +
8969 +       if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
8970 +               goto out;
8971 +       if (!size)
8972 +               goto out;
8973 +       if (dev->dma_mem)
8974 +               goto out;
8975 +
8976 +       /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
8977 +
8978 +       mem_base = ioremap(bus_addr, size);
8979 +       if (!mem_base)
8980 +               goto out;
8981 +
8982 +       dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
8983 +       if (!dev->dma_mem)
8984 +               goto out;
8985 +       memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
8986 +       dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
8987 +       if (!dev->dma_mem->bitmap)
8988 +               goto free1_out;
8989 +       memset(dev->dma_mem->bitmap, 0, bitmap_size);
8990 +
8991 +       dev->dma_mem->virt_base = mem_base;
8992 +       dev->dma_mem->device_base = device_addr;
8993 +       dev->dma_mem->size = pages;
8994 +       dev->dma_mem->flags = flags;
8995 +
8996 +       if (flags & DMA_MEMORY_MAP)
8997 +               return DMA_MEMORY_MAP;
8998 +
8999 +       return DMA_MEMORY_IO;
9000 +
9001 + free1_out:
9002 +       kfree(dev->dma_mem->bitmap);
9003 + out:
9004 +       return 0;
9005 +}
9006 +EXPORT_SYMBOL(dma_declare_coherent_memory);
9007 +
9008 +void dma_release_declared_memory(struct device *dev)
9009 +{
9010 +       struct dma_coherent_mem *mem = dev->dma_mem;
9011 +       
9012 +       if(!mem)
9013 +               return;
9014 +       dev->dma_mem = NULL;
9015 +       iounmap(mem->virt_base);
9016 +       kfree(mem->bitmap);
9017 +       kfree(mem);
9018 +}
9019 +EXPORT_SYMBOL(dma_release_declared_memory);
9020 +
9021 +void *dma_mark_declared_memory_occupied(struct device *dev,
9022 +                                       dma_addr_t device_addr, size_t size)
9023 +{
9024 +       struct dma_coherent_mem *mem = dev->dma_mem;
9025 +       int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
9026 +       int pos, err;
9027 +
9028 +       if (!mem)
9029 +               return ERR_PTR(-EINVAL);
9030 +
9031 +       pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
9032 +       err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages << PAGE_SHIFT));
9033 +       if (err != 0)
9034 +               return ERR_PTR(err);
9035 +       return mem->virt_base + (pos << PAGE_SHIFT);
9036 +}
9037 +EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
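
The three declared-memory routines manage a per-device pool carved out of a bus region, for devices that want coherent allocations served from their own on-board RAM. A sketch of the expected call sequence, with an illustrative 1 MB window at a made-up bus address:

        if (dma_declare_coherent_memory(dev, 0xd0000000, 0xd0000000,
                                        0x100000, DMA_MEMORY_MAP)
            != DMA_MEMORY_MAP)
                return -ENXIO;

        /* dma_alloc_coherent(dev, ...) now tries this pool first,
         * falling back to normal pages unless DMA_MEMORY_EXCLUSIVE. */

        /* ... at teardown: */
        dma_release_declared_memory(dev);
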
9038 +
9039 +dma_addr_t
9040 +dma_map_single(struct device *dev, void *ptr, size_t size,
9041 +              enum dma_data_direction direction)
9042 +{
9043 +       dma_addr_t dma;
9044 +
9045 +       if (direction == DMA_NONE)
9046 +               BUG();
9047 +       WARN_ON(size == 0);
9048 +
9049 +       if (swiotlb) {
9050 +               dma = swiotlb_map_single(dev, ptr, size, direction);
9051 +       } else {
9052 +               dma = virt_to_bus(ptr);
9053 +               IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size));
9054 +               IOMMU_BUG_ON(address_needs_mapping(dev, dma));
9055 +       }
9056 +
9057 +       flush_write_buffers();
9058 +       return dma;
9059 +}
9060 +EXPORT_SYMBOL(dma_map_single);
9061 +
9062 +void
9063 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
9064 +                enum dma_data_direction direction)
9065 +{
9066 +       if (direction == DMA_NONE)
9067 +               BUG();
9068 +       if (swiotlb)
9069 +               swiotlb_unmap_single(dev, dma_addr, size, direction);
9070 +}
9071 +EXPORT_SYMBOL(dma_unmap_single);
9072 +
9073 +void
9074 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
9075 +                       enum dma_data_direction direction)
9076 +{
9077 +       if (swiotlb)
9078 +               swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
9079 +}
9080 +EXPORT_SYMBOL(dma_sync_single_for_cpu);
9081 +
9082 +void
9083 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
9084 +                           enum dma_data_direction direction)
9085 +{
9086 +       if (swiotlb)
9087 +               swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
9088 +}
9089 +EXPORT_SYMBOL(dma_sync_single_for_device);
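
Together, dma_map_single() and the two sync routines implement the streaming-DMA discipline: the mapping belongs to the device except between a _for_cpu and the matching _for_device call. A sketch of that discipline from a driver's point of view (buffer, length and the transmit helper are hypothetical):

        dma_addr_t bus = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

        if (dma_mapping_error(bus))
                return -EIO;
        start_tx(dev, bus, len);                        /* illustrative */

        /* To inspect or touch buf while still mapped: */
        dma_sync_single_for_cpu(dev, bus, len, DMA_TO_DEVICE);
        /* ... CPU accesses to buf ... */
        dma_sync_single_for_device(dev, bus, len, DMA_TO_DEVICE);

        /* When the device is finished with it: */
        dma_unmap_single(dev, bus, len, DMA_TO_DEVICE);
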
9090 +
9091 +/*
9092 + * Local variables:
9093 + *  c-file-style: "linux"
9094 + *  indent-tabs-mode: t
9095 + *  c-indent-level: 8
9096 + *  c-basic-offset: 8
9097 + *  tab-width: 8
9098 + * End:
9099 + */
9100 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/process-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/process-xen.c
9101 --- ref-linux-2.6.16.9/arch/i386/kernel/process-xen.c   1970-01-01 01:00:00.000000000 +0100
9102 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/process-xen.c      2006-04-10 00:05:52.000000000 +0200
9103 @@ -0,0 +1,833 @@
9104 +/*
9105 + *  linux/arch/i386/kernel/process.c
9106 + *
9107 + *  Copyright (C) 1995  Linus Torvalds
9108 + *
9109 + *  Pentium III FXSR, SSE support
9110 + *     Gareth Hughes <gareth@valinux.com>, May 2000
9111 + */
9112 +
9113 +/*
9114 + * This file handles the architecture-dependent parts of process handling..
9115 + */
9116 +
9117 +#include <stdarg.h>
9118 +
9119 +#include <linux/cpu.h>
9120 +#include <linux/errno.h>
9121 +#include <linux/sched.h>
9122 +#include <linux/fs.h>
9123 +#include <linux/kernel.h>
9124 +#include <linux/mm.h>
9125 +#include <linux/elfcore.h>
9126 +#include <linux/smp.h>
9127 +#include <linux/smp_lock.h>
9128 +#include <linux/stddef.h>
9129 +#include <linux/slab.h>
9130 +#include <linux/vmalloc.h>
9131 +#include <linux/user.h>
9132 +#include <linux/a.out.h>
9133 +#include <linux/interrupt.h>
9134 +#include <linux/config.h>
9135 +#include <linux/utsname.h>
9136 +#include <linux/delay.h>
9137 +#include <linux/reboot.h>
9138 +#include <linux/init.h>
9139 +#include <linux/mc146818rtc.h>
9140 +#include <linux/module.h>
9141 +#include <linux/kallsyms.h>
9142 +#include <linux/ptrace.h>
9143 +#include <linux/random.h>
9144 +#include <linux/kprobes.h>
9145 +
9146 +#include <asm/uaccess.h>
9147 +#include <asm/pgtable.h>
9148 +#include <asm/system.h>
9149 +#include <asm/io.h>
9150 +#include <asm/ldt.h>
9151 +#include <asm/processor.h>
9152 +#include <asm/i387.h>
9153 +#include <asm/desc.h>
9154 +#include <asm/vm86.h>
9155 +#ifdef CONFIG_MATH_EMULATION
9156 +#include <asm/math_emu.h>
9157 +#endif
9158 +
9159 +#include <xen/interface/physdev.h>
9160 +#include <xen/interface/vcpu.h>
9161 +
9162 +#include <linux/err.h>
9163 +
9164 +#include <asm/tlbflush.h>
9165 +#include <asm/cpu.h>
9169 +
9170 +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
9171 +
9172 +static int hlt_counter;
9173 +
9174 +unsigned long boot_option_idle_override = 0;
9175 +EXPORT_SYMBOL(boot_option_idle_override);
9176 +
9177 +/*
9178 + * Return saved PC of a blocked thread.
9179 + */
9180 +unsigned long thread_saved_pc(struct task_struct *tsk)
9181 +{
9182 +       return ((unsigned long *)tsk->thread.esp)[3];
9183 +}
9184 +
9185 +/*
9186 + * Power management idle function, if any..
9187 + */
9188 +void (*pm_idle)(void);
9189 +EXPORT_SYMBOL(pm_idle);
9190 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
9191 +
9192 +void disable_hlt(void)
9193 +{
9194 +       hlt_counter++;
9195 +}
9196 +
9197 +EXPORT_SYMBOL(disable_hlt);
9198 +
9199 +void enable_hlt(void)
9200 +{
9201 +       hlt_counter--;
9202 +}
9203 +
9204 +EXPORT_SYMBOL(enable_hlt);
9205 +
9206 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
9207 +extern void stop_hz_timer(void);
9208 +extern void start_hz_timer(void);
9209 +void xen_idle(void)
9210 +{
9211 +       local_irq_disable();
9212 +
9213 +       if (need_resched())
9214 +               local_irq_enable();
9215 +       else {
9216 +               clear_thread_flag(TIF_POLLING_NRFLAG);
9217 +               smp_mb__after_clear_bit();
9218 +               stop_hz_timer();
9219 +               /* Blocking includes an implicit local_irq_enable(). */
9220 +               HYPERVISOR_block();
9221 +               start_hz_timer();
9222 +               set_thread_flag(TIF_POLLING_NRFLAG);
9223 +       }
9224 +}
9225 +#ifdef CONFIG_APM_MODULE
9226 +EXPORT_SYMBOL(default_idle);
9227 +#endif
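
The ordering in xen_idle() is what makes it race-free: interrupts stay disabled from the need_resched() test until HYPERVISOR_block(), and the hypervisor re-enables event delivery atomically with blocking, so a wakeup arriving in the window cannot be lost. As an assumption about the Xen 3.0 interface (this patch only ever calls the HYPERVISOR_block() wrapper), the same pattern spelled out via the raw sched_op hypercall would look like:

        local_irq_disable();
        if (need_resched())
                local_irq_enable();
        else
                /* blocks the VCPU; returns with event delivery re-enabled */
                HYPERVISOR_sched_op(SCHEDOP_block, 0);
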
9228 +
9229 +#ifdef CONFIG_HOTPLUG_CPU
9230 +extern cpumask_t cpu_initialized;
9231 +static inline void play_dead(void)
9232 +{
9233 +       idle_task_exit();
9234 +       local_irq_disable();
9235 +       cpu_clear(smp_processor_id(), cpu_initialized);
9236 +       preempt_enable_no_resched();
9237 +       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
9238 +       /* Same as drivers/xen/core/smpboot.c:cpu_bringup(). */
9239 +       cpu_init();
9240 +       touch_softlockup_watchdog();
9241 +       preempt_disable();
9242 +       local_irq_enable();
9243 +}
9244 +#else
9245 +static inline void play_dead(void)
9246 +{
9247 +       BUG();
9248 +}
9249 +#endif /* CONFIG_HOTPLUG_CPU */
9250 +
9251 +/*
9252 + * The idle thread. There's no useful work to be
9253 + * done, so just try to conserve power and have a
9254 + * low exit latency (ie sit in a loop waiting for
9255 + * somebody to say that they'd like to reschedule)
9256 + */
9257 +void cpu_idle(void)
9258 +{
9259 +       int cpu = smp_processor_id();
9260 +
9261 +       set_thread_flag(TIF_POLLING_NRFLAG);
9262 +
9263 +       /* endless idle loop with no priority at all */
9264 +       while (1) {
9265 +               while (!need_resched()) {
9266 +
9267 +                       if (__get_cpu_var(cpu_idle_state))
9268 +                               __get_cpu_var(cpu_idle_state) = 0;
9269 +
9270 +                       rmb();
9271 +
9272 +                       if (cpu_is_offline(cpu))
9273 +                               play_dead();
9274 +
9275 +                       __get_cpu_var(irq_stat).idle_timestamp = jiffies;
9276 +                       xen_idle();
9277 +               }
9278 +               preempt_enable_no_resched();
9279 +               schedule();
9280 +               preempt_disable();
9281 +       }
9282 +}
9283 +
9284 +void cpu_idle_wait(void)
9285 +{
9286 +       unsigned int cpu, this_cpu = get_cpu();
9287 +       cpumask_t map;
9288 +
9289 +       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
9290 +       put_cpu();
9291 +
9292 +       cpus_clear(map);
9293 +       for_each_online_cpu(cpu) {
9294 +               per_cpu(cpu_idle_state, cpu) = 1;
9295 +               cpu_set(cpu, map);
9296 +       }
9297 +
9298 +       __get_cpu_var(cpu_idle_state) = 0;
9299 +
9300 +       wmb();
9301 +       do {
9302 +               ssleep(1);
9303 +               for_each_online_cpu(cpu) {
9304 +                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
9305 +                               cpu_clear(cpu, map);
9306 +               }
9307 +               cpus_and(map, map, cpu_online_map);
9308 +       } while (!cpus_empty(map));
9309 +}
9310 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
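
cpu_idle_wait() acts as a barrier for code that swaps the idle routine: it raises cpu_idle_state on every online CPU, the idle loop above clears the flag each time around, and the caller polls until the map drains, at which point no CPU can still be executing the old routine. The intended caller pattern, with a hypothetical replacement (under Xen the loop always calls xen_idle(), so this export mainly preserves the native interface):

        pm_idle = my_idle_routine;      /* hypothetical replacement */
        cpu_idle_wait();                /* old routine guaranteed quiesced */
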
9311 +
9312 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
9313 +/* Always use xen_idle() instead. */
9314 +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) {}
9315 +
9316 +void show_regs(struct pt_regs * regs)
9317 +{
9318 +       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
9319 +
9320 +       printk("\n");
9321 +       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
9322 +       printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
9323 +       print_symbol("EIP is at %s\n", regs->eip);
9324 +
9325 +       if (user_mode(regs))
9326 +               printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
9327 +       printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
9328 +              regs->eflags, print_tainted(), system_utsname.release,
9329 +              (int)strcspn(system_utsname.version, " "),
9330 +              system_utsname.version);
9331 +       printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
9332 +               regs->eax,regs->ebx,regs->ecx,regs->edx);
9333 +       printk("ESI: %08lx EDI: %08lx EBP: %08lx",
9334 +               regs->esi, regs->edi, regs->ebp);
9335 +       printk(" DS: %04x ES: %04x\n",
9336 +               0xffff & regs->xds,0xffff & regs->xes);
9337 +
9338 +       cr0 = read_cr0();
9339 +       cr2 = read_cr2();
9340 +       cr3 = read_cr3();
9341 +       cr4 = read_cr4_safe();
9342 +       printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
9343 +       show_trace(NULL, &regs->esp);
9344 +}
9345 +
9346 +/*
9347 + * This gets run with %ebx containing the
9348 + * function to call, and %edx containing
9349 + * the "args".
9350 + */
9351 +extern void kernel_thread_helper(void);
9352 +__asm__(".section .text\n"
9353 +       ".align 4\n"
9354 +       "kernel_thread_helper:\n\t"
9355 +       "movl %edx,%eax\n\t"
9356 +       "pushl %edx\n\t"
9357 +       "call *%ebx\n\t"
9358 +       "pushl %eax\n\t"
9359 +       "call do_exit\n"
9360 +       ".previous");
9361 +
9362 +/*
9363 + * Create a kernel thread
9364 + */
9365 +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
9366 +{
9367 +       struct pt_regs regs;
9368 +
9369 +       memset(&regs, 0, sizeof(regs));
9370 +
9371 +       regs.ebx = (unsigned long) fn;
9372 +       regs.edx = (unsigned long) arg;
9373 +
9374 +       regs.xds = __USER_DS;
9375 +       regs.xes = __USER_DS;
9376 +       regs.orig_eax = -1;
9377 +       regs.eip = (unsigned long) kernel_thread_helper;
9378 +       regs.xcs = GET_KERNEL_CS();
9379 +       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
9380 +
9381 +       /* Ok, create the new process.. */
9382 +       return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
9383 +}
9384 +EXPORT_SYMBOL(kernel_thread);
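
kernel_thread() fabricates a register frame so the child starts in kernel_thread_helper, which calls fn(arg) and feeds the return value to do_exit(). Typical usage, with a hypothetical worker function:

        static int my_worker(void *arg)
        {
                /* runs in its own kernel thread; the return value
                 * becomes the thread's exit code */
                return 0;
        }

        static void spawn_worker(void)
        {
                int pid = kernel_thread(my_worker, NULL,
                                        CLONE_FS | CLONE_FILES | SIGCHLD);
                if (pid < 0)
                        printk(KERN_ERR "worker creation failed: %d\n", pid);
        }
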
9385 +
9386 +/*
9387 + * Free current thread data structures etc..
9388 + */
9389 +void exit_thread(void)
9390 +{
9391 +       struct task_struct *tsk = current;
9392 +       struct thread_struct *t = &tsk->thread;
9393 +
9394 +       /*
9395 +        * Remove function-return probe instances associated with this task
9396 +        * and put them back on the free list. Do not insert an exit probe for
9397 +        * this function, it will be disabled by kprobe_flush_task if you do.
9398 +        */
9399 +       kprobe_flush_task(tsk);
9400 +
9401 +       /* The process may have allocated an io port bitmap... nuke it. */
9402 +       if (unlikely(NULL != t->io_bitmap_ptr)) {
9403 +               physdev_op_t op = { 0 };
9404 +               op.cmd = PHYSDEVOP_SET_IOBITMAP;
9405 +               HYPERVISOR_physdev_op(&op);
9406 +               kfree(t->io_bitmap_ptr);
9407 +               t->io_bitmap_ptr = NULL;
9408 +       }
9409 +}
9410 +
9411 +void flush_thread(void)
9412 +{
9413 +       struct task_struct *tsk = current;
9414 +
9415 +       memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
9416 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
9417 +       /*
9418 +        * Forget coprocessor state..
9419 +        */
9420 +       clear_fpu(tsk);
9421 +       clear_used_math();
9422 +}
9423 +
9424 +void release_thread(struct task_struct *dead_task)
9425 +{
9426 +       BUG_ON(dead_task->mm);
9427 +       release_vm86_irqs(dead_task);
9428 +}
9429 +
9430 +/*
9431 + * This gets called before we allocate a new thread and copy
9432 + * the current task into it.
9433 + */
9434 +void prepare_to_copy(struct task_struct *tsk)
9435 +{
9436 +       unlazy_fpu(tsk);
9437 +}
9438 +
9439 +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
9440 +       unsigned long unused,
9441 +       struct task_struct * p, struct pt_regs * regs)
9442 +{
9443 +       struct pt_regs * childregs;
9444 +       struct task_struct *tsk;
9445 +       int err;
9446 +
9447 +       childregs = task_pt_regs(p);
9448 +       *childregs = *regs;
9449 +       childregs->eax = 0;
9450 +       childregs->esp = esp;
9451 +
9452 +       p->thread.esp = (unsigned long) childregs;
9453 +       p->thread.esp0 = (unsigned long) (childregs+1);
9454 +
9455 +       p->thread.eip = (unsigned long) ret_from_fork;
9456 +
9457 +       savesegment(fs,p->thread.fs);
9458 +       savesegment(gs,p->thread.gs);
9459 +
9460 +       tsk = current;
9461 +       if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
9462 +               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
9463 +               if (!p->thread.io_bitmap_ptr) {
9464 +                       p->thread.io_bitmap_max = 0;
9465 +                       return -ENOMEM;
9466 +               }
9467 +               memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
9468 +                       IO_BITMAP_BYTES);
9469 +       }
9470 +
9471 +       /*
9472 +        * Set a new TLS for the child thread?
9473 +        */
9474 +       if (clone_flags & CLONE_SETTLS) {
9475 +               struct desc_struct *desc;
9476 +               struct user_desc info;
9477 +               int idx;
9478 +
9479 +               err = -EFAULT;
9480 +               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
9481 +                       goto out;
9482 +               err = -EINVAL;
9483 +               if (LDT_empty(&info))
9484 +                       goto out;
9485 +
9486 +               idx = info.entry_number;
9487 +               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9488 +                       goto out;
9489 +
9490 +               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9491 +               desc->a = LDT_entry_a(&info);
9492 +               desc->b = LDT_entry_b(&info);
9493 +       }
9494 +
9495 +       p->thread.iopl = current->thread.iopl;
9496 +
9497 +       err = 0;
9498 + out:
9499 +       if (err && p->thread.io_bitmap_ptr) {
9500 +               kfree(p->thread.io_bitmap_ptr);
9501 +               p->thread.io_bitmap_max = 0;
9502 +       }
9503 +       return err;
9504 +}
9505 +
9506 +/*
9507 + * fill in the user structure for a core dump..
9508 + */
9509 +void dump_thread(struct pt_regs * regs, struct user * dump)
9510 +{
9511 +       int i;
9512 +
9513 +/* changed the size calculations - should hopefully work better. lbt */
9514 +       dump->magic = CMAGIC;
9515 +       dump->start_code = 0;
9516 +       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
9517 +       dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
9518 +       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
9519 +       dump->u_dsize -= dump->u_tsize;
9520 +       dump->u_ssize = 0;
9521 +       for (i = 0; i < 8; i++)
9522 +               dump->u_debugreg[i] = current->thread.debugreg[i];  
9523 +
9524 +       if (dump->start_stack < TASK_SIZE)
9525 +               dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
9526 +
9527 +       dump->regs.ebx = regs->ebx;
9528 +       dump->regs.ecx = regs->ecx;
9529 +       dump->regs.edx = regs->edx;
9530 +       dump->regs.esi = regs->esi;
9531 +       dump->regs.edi = regs->edi;
9532 +       dump->regs.ebp = regs->ebp;
9533 +       dump->regs.eax = regs->eax;
9534 +       dump->regs.ds = regs->xds;
9535 +       dump->regs.es = regs->xes;
9536 +       savesegment(fs,dump->regs.fs);
9537 +       savesegment(gs,dump->regs.gs);
9538 +       dump->regs.orig_eax = regs->orig_eax;
9539 +       dump->regs.eip = regs->eip;
9540 +       dump->regs.cs = regs->xcs;
9541 +       dump->regs.eflags = regs->eflags;
9542 +       dump->regs.esp = regs->esp;
9543 +       dump->regs.ss = regs->xss;
9544 +
9545 +       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
9546 +}
9547 +EXPORT_SYMBOL(dump_thread);
9548 +
9549 +/* 
9550 + * Capture the user space registers if the task is not running (in user space)
9551 + */
9552 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
9553 +{
9554 +       struct pt_regs ptregs = *task_pt_regs(tsk);
9555 +       ptregs.xcs &= 0xffff;
9556 +       ptregs.xds &= 0xffff;
9557 +       ptregs.xes &= 0xffff;
9558 +       ptregs.xss &= 0xffff;
9559 +
9560 +       elf_core_copy_regs(regs, &ptregs);
9561 +
9562 +       return 1;
9563 +}
9564 +
9565 +/*
9566 + * This function selects if the context switch from prev to next
9567 + * has to tweak the TSC disable bit in the cr4.
9568 + */
9569 +static inline void disable_tsc(struct task_struct *prev_p,
9570 +                              struct task_struct *next_p)
9571 +{
9572 +       struct thread_info *prev, *next;
9573 +
9574 +       /*
9575 +        * gcc should eliminate the ->thread_info dereference if
9576 +        * has_secure_computing returns 0 at compile time (SECCOMP=n).
9577 +        */
9578 +       prev = task_thread_info(prev_p);
9579 +       next = task_thread_info(next_p);
9580 +
9581 +       if (has_secure_computing(prev) || has_secure_computing(next)) {
9582 +               /* slow path here */
9583 +               if (has_secure_computing(prev) &&
9584 +                   !has_secure_computing(next)) {
9585 +                       write_cr4(read_cr4() & ~X86_CR4_TSD);
9586 +               } else if (!has_secure_computing(prev) &&
9587 +                          has_secure_computing(next))
9588 +                       write_cr4(read_cr4() | X86_CR4_TSD);
9589 +       }
9590 +}
9591 +
9592 +/*
9593 + *     switch_to(x,y) should switch tasks from x to y.
9594 + *
9595 + * We fsave/fwait so that an exception goes off at the right time
9596 + * (as a call from the fsave or fwait in effect) rather than to
9597 + * the wrong process. Lazy FP saving no longer makes any sense
9598 + * with modern CPUs, and this simplifies a lot of things (SMP
9599 + * and UP become the same).
9600 + *
9601 + * NOTE! We used to use the x86 hardware context switching. The
9602 + * reason for not using it any more becomes apparent when you
9603 + * try to recover gracefully from saved state that is no longer
9604 + * valid (stale segment register values in particular). With the
9605 + * hardware task-switch, there is no way to fix up bad state in
9606 + * a reasonable manner.
9607 + *
9608 + * The fact that Intel documents the hardware task-switching to
9609 + * be slow is a fairly red herring - this code is not noticeably
9610 + * faster. However, there _is_ some room for improvement here,
9611 + * so the performance issues may eventually be a valid point.
9612 + * More important, however, is the fact that this allows us much
9613 + * more flexibility.
9614 + *
9615 + * The return value (in %eax) will be the "prev" task after
9616 + * the task-switch, and shows up in ret_from_fork in entry.S,
9617 + * for example.
9618 + */
9619 +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
9620 +{
9621 +       struct thread_struct *prev = &prev_p->thread,
9622 +                                *next = &next_p->thread;
9623 +       int cpu = smp_processor_id();
9624 +#ifndef CONFIG_X86_NO_TSS
9625 +       struct tss_struct *tss = &per_cpu(init_tss, cpu);
9626 +#endif
9627 +       physdev_op_t iopl_op, iobmp_op;
9628 +       multicall_entry_t _mcl[8], *mcl = _mcl;
9629 +
9630 +       /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
9631 +
9632 +       /*
9633 +        * This is basically '__unlazy_fpu', except that we queue a
9634 +        * multicall to indicate FPU task switch, rather than
9635 +        * synchronously trapping to Xen.
9636 +        */
9637 +       if (prev_p->thread_info->status & TS_USEDFPU) {
9638 +               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
9639 +               mcl->op      = __HYPERVISOR_fpu_taskswitch;
9640 +               mcl->args[0] = 1;
9641 +               mcl++;
9642 +       }
9643 +#if 0 /* lazy fpu sanity check */
9644 +       else BUG_ON(!(read_cr0() & 8));
9645 +#endif
9646 +
9647 +       /*
9648 +        * Reload esp0.
9649 +        * This is load_esp0(tss, next) with a multicall.
9650 +        */
9651 +       mcl->op      = __HYPERVISOR_stack_switch;
9652 +       mcl->args[0] = __KERNEL_DS;
9653 +       mcl->args[1] = next->esp0;
9654 +       mcl++;
9655 +
9656 +       /*
9657 +        * Load the per-thread Thread-Local Storage descriptor.
9658 +        * This is load_TLS(next, cpu) with multicalls.
9659 +        */
9660 +#define C(i) do {                                                      \
9661 +       if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
9662 +                    next->tls_array[i].b != prev->tls_array[i].b)) {   \
9663 +               mcl->op = __HYPERVISOR_update_descriptor;               \
9664 +               *(u64 *)&mcl->args[0] = virt_to_machine(                \
9665 +                       &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
9666 +               *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i];    \
9667 +               mcl++;                                                  \
9668 +       }                                                               \
9669 +} while (0)
9670 +       C(0); C(1); C(2);
9671 +#undef C
9672 +
9673 +       if (unlikely(prev->iopl != next->iopl)) {
9674 +               iopl_op.cmd             = PHYSDEVOP_SET_IOPL;
9675 +               iopl_op.u.set_iopl.iopl = (next->iopl == 0) ? 1 :
9676 +                       (next->iopl >> 12) & 3;
9677 +               mcl->op      = __HYPERVISOR_physdev_op;
9678 +               mcl->args[0] = (unsigned long)&iopl_op;
9679 +               mcl++;
9680 +       }
9681 +
9682 +       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
9683 +               iobmp_op.cmd                     =
9684 +                       PHYSDEVOP_SET_IOBITMAP;
9685 +               iobmp_op.u.set_iobitmap.bitmap   =
9686 +                       (char *)next->io_bitmap_ptr;
9687 +               iobmp_op.u.set_iobitmap.nr_ports =
9688 +                       next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
9689 +               mcl->op      = __HYPERVISOR_physdev_op;
9690 +               mcl->args[0] = (unsigned long)&iobmp_op;
9691 +               mcl++;
9692 +       }
9693 +
9694 +       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
9695 +
9696 +       /*
9697 +        * Restore %fs and %gs if needed.
9698 +        *
9699 +        * Glibc normally makes %fs be zero, and %gs is one of
9700 +        * the TLS segments.
9701 +        */
9702 +       if (unlikely(next->fs))
9703 +               loadsegment(fs, next->fs);
9704 +
9705 +       if (next->gs)
9706 +               loadsegment(gs, next->gs);
9707 +
9708 +       /*
9709 +        * Now maybe reload the debug registers
9710 +        */
9711 +       if (unlikely(next->debugreg[7])) {
9712 +               set_debugreg(next->debugreg[0], 0);
9713 +               set_debugreg(next->debugreg[1], 1);
9714 +               set_debugreg(next->debugreg[2], 2);
9715 +               set_debugreg(next->debugreg[3], 3);
9716 +               /* no 4 and 5 */
9717 +               set_debugreg(next->debugreg[6], 6);
9718 +               set_debugreg(next->debugreg[7], 7);
9719 +       }
9720 +
9721 +       disable_tsc(prev_p, next_p);
9722 +
9723 +       return prev_p;
9724 +}
9725 +
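__switch_to() batches up to eight operations (FPU switch, ring-1 stack, three TLS descriptors, IOPL, I/O bitmap) into one HYPERVISOR_multicall(), paying for a single trap into Xen instead of one per operation. For comparison, the unbatched form of the first two entries would be two separate hypercalls (these per-operation wrappers exist in the Xen interface; the batched form above is what the patch actually uses):

        /* One trap each -- exactly what the multicall batch avoids: */
        HYPERVISOR_fpu_taskswitch(1);                     /* set CR0.TS */
        HYPERVISOR_stack_switch(__KERNEL_DS, next->esp0); /* kernel stack */
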
9726 +asmlinkage int sys_fork(struct pt_regs regs)
9727 +{
9728 +       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9729 +}
9730 +
9731 +asmlinkage int sys_clone(struct pt_regs regs)
9732 +{
9733 +       unsigned long clone_flags;
9734 +       unsigned long newsp;
9735 +       int __user *parent_tidptr, *child_tidptr;
9736 +
9737 +       clone_flags = regs.ebx;
9738 +       newsp = regs.ecx;
9739 +       parent_tidptr = (int __user *)regs.edx;
9740 +       child_tidptr = (int __user *)regs.edi;
9741 +       if (!newsp)
9742 +               newsp = regs.esp;
9743 +       return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
9744 +}
9745 +
9746 +/*
9747 + * This is trivial, and on the face of it looks like it
9748 + * could equally well be done in user mode.
9749 + *
9750 + * Not so, for quite unobvious reasons - register pressure.
9751 + * In user mode vfork() cannot have a stack frame, and if
9752 + * done by calling the "clone()" system call directly, you
9753 + * do not have enough call-clobbered registers to hold all
9754 + * the information you need.
9755 + */
9756 +asmlinkage int sys_vfork(struct pt_regs regs)
9757 +{
9758 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9759 +}
9760 +
9761 +/*
9762 + * sys_execve() executes a new program.
9763 + */
9764 +asmlinkage int sys_execve(struct pt_regs regs)
9765 +{
9766 +       int error;
9767 +       char * filename;
9768 +
9769 +       filename = getname((char __user *) regs.ebx);
9770 +       error = PTR_ERR(filename);
9771 +       if (IS_ERR(filename))
9772 +               goto out;
9773 +       error = do_execve(filename,
9774 +                       (char __user * __user *) regs.ecx,
9775 +                       (char __user * __user *) regs.edx,
9776 +                       &regs);
9777 +       if (error == 0) {
9778 +               task_lock(current);
9779 +               current->ptrace &= ~PT_DTRACE;
9780 +               task_unlock(current);
9781 +               /* Make sure we don't return using sysenter.. */
9782 +               set_thread_flag(TIF_IRET);
9783 +       }
9784 +       putname(filename);
9785 +out:
9786 +       return error;
9787 +}
9788 +
9789 +#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
9790 +#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
9791 +
9792 +unsigned long get_wchan(struct task_struct *p)
9793 +{
9794 +       unsigned long ebp, esp, eip;
9795 +       unsigned long stack_page;
9796 +       int count = 0;
9797 +       if (!p || p == current || p->state == TASK_RUNNING)
9798 +               return 0;
9799 +       stack_page = (unsigned long)task_stack_page(p);
9800 +       esp = p->thread.esp;
9801 +       if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
9802 +               return 0;
9803 +       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
9804 +       ebp = *(unsigned long *) esp;
9805 +       do {
9806 +               if (ebp < stack_page || ebp > top_ebp+stack_page)
9807 +                       return 0;
9808 +               eip = *(unsigned long *) (ebp+4);
9809 +               if (!in_sched_functions(eip))
9810 +                       return eip;
9811 +               ebp = *(unsigned long *) ebp;
9812 +       } while (count++ < 16);
9813 +       return 0;
9814 +}
9815 +EXPORT_SYMBOL(get_wchan);
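
get_wchan() walks the saved frame-pointer chain of a sleeping task: switch_to() pushes %ebp last, so *thread.esp is the first frame pointer, each frame stores the caller's return address at ebp+4, and the walk stops at the first address outside the scheduler. A compact picture of the layout the loop traverses:

        /*  higher addresses
         *  | ...                       |
         *  | return address            | <- ebp + 4  (eip candidate)
         *  | saved ebp                 | <- ebp      (next frame)
         *  | ...                       |
         *  | ebp pushed by switch_to() | <- p->thread.esp
         *  lower addresses
         */
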
9816 +
9817 +/*
9818 + * sys_alloc_thread_area: get a yet unused TLS descriptor index.
9819 + */
9820 +static int get_free_idx(void)
9821 +{
9822 +       struct thread_struct *t = &current->thread;
9823 +       int idx;
9824 +
9825 +       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
9826 +               if (desc_empty(t->tls_array + idx))
9827 +                       return idx + GDT_ENTRY_TLS_MIN;
9828 +       return -ESRCH;
9829 +}
9830 +
9831 +/*
9832 + * Set a given TLS descriptor:
9833 + */
9834 +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
9835 +{
9836 +       struct thread_struct *t = &current->thread;
9837 +       struct user_desc info;
9838 +       struct desc_struct *desc;
9839 +       int cpu, idx;
9840 +
9841 +       if (copy_from_user(&info, u_info, sizeof(info)))
9842 +               return -EFAULT;
9843 +       idx = info.entry_number;
9844 +
9845 +       /*
9846 +        * index -1 means the kernel should try to find and
9847 +        * allocate an empty descriptor:
9848 +        */
9849 +       if (idx == -1) {
9850 +               idx = get_free_idx();
9851 +               if (idx < 0)
9852 +                       return idx;
9853 +               if (put_user(idx, &u_info->entry_number))
9854 +                       return -EFAULT;
9855 +       }
9856 +
9857 +       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9858 +               return -EINVAL;
9859 +
9860 +       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
9861 +
9862 +       /*
9863 +        * We must not get preempted while modifying the TLS.
9864 +        */
9865 +       cpu = get_cpu();
9866 +
9867 +       if (LDT_empty(&info)) {
9868 +               desc->a = 0;
9869 +               desc->b = 0;
9870 +       } else {
9871 +               desc->a = LDT_entry_a(&info);
9872 +               desc->b = LDT_entry_b(&info);
9873 +       }
9874 +       load_TLS(t, cpu);
9875 +
9876 +       put_cpu();
9877 +
9878 +       return 0;
9879 +}
9880 +
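From user space this syscall is how glibc installs its TLS segment; entry_number = -1 asks the kernel to pick a free GDT slot, which is written back through the same struct. A self-contained user-space sketch against the i386 ABI (the base address is illustrative):

        #include <asm/ldt.h>            /* struct user_desc */
        #include <sys/syscall.h>
        #include <unistd.h>
        #include <stdio.h>

        int main(void)
        {
                struct user_desc d = {
                        .entry_number   = -1,           /* kernel picks a slot */
                        .base_addr      = 0x1000,       /* illustrative */
                        .limit          = 0xfffff,
                        .seg_32bit      = 1,
                        .limit_in_pages = 1,
                        .useable        = 1,
                };

                if (syscall(SYS_set_thread_area, &d) == 0)
                        printf("TLS installed in GDT entry %u\n",
                               d.entry_number);
                return 0;
        }
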
9881 +/*
9882 + * Get the current Thread-Local Storage area:
9883 + */
9884 +
9885 +#define GET_BASE(desc) ( \
9886 +       (((desc)->a >> 16) & 0x0000ffff) | \
9887 +       (((desc)->b << 16) & 0x00ff0000) | \
9888 +       ( (desc)->b        & 0xff000000)   )
9889 +
9890 +#define GET_LIMIT(desc) ( \
9891 +       ((desc)->a & 0x0ffff) | \
9892 +        ((desc)->b & 0xf0000) )
9893 +       
9894 +#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
9895 +#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
9896 +#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
9897 +#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
9898 +#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
9899 +#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
9900 +
9901 +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
9902 +{
9903 +       struct user_desc info;
9904 +       struct desc_struct *desc;
9905 +       int idx;
9906 +
9907 +       if (get_user(idx, &u_info->entry_number))
9908 +               return -EFAULT;
9909 +       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9910 +               return -EINVAL;
9911 +
9912 +       memset(&info, 0, sizeof(info));
9913 +
9914 +       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9915 +
9916 +       info.entry_number = idx;
9917 +       info.base_addr = GET_BASE(desc);
9918 +       info.limit = GET_LIMIT(desc);
9919 +       info.seg_32bit = GET_32BIT(desc);
9920 +       info.contents = GET_CONTENTS(desc);
9921 +       info.read_exec_only = !GET_WRITABLE(desc);
9922 +       info.limit_in_pages = GET_LIMIT_PAGES(desc);
9923 +       info.seg_not_present = !GET_PRESENT(desc);
9924 +       info.useable = GET_USEABLE(desc);
9925 +
9926 +       if (copy_to_user(u_info, &info, sizeof(info)))
9927 +               return -EFAULT;
9928 +       return 0;
9929 +}
9930 +
9931 +unsigned long arch_align_stack(unsigned long sp)
9932 +{
9933 +       if (randomize_va_space)
9934 +               sp -= get_random_int() % 8192;
9935 +       return sp & ~0xf;
9936 +}
9937 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/quirks-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/quirks-xen.c
9938 --- ref-linux-2.6.16.9/arch/i386/kernel/quirks-xen.c    1970-01-01 01:00:00.000000000 +0100
9939 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/quirks-xen.c       2006-04-10 00:05:52.000000000 +0200
9940 @@ -0,0 +1,48 @@
9941 +/*
9942 + * This file contains work-arounds for x86 and x86_64 platform bugs.
9943 + */
9944 +#include <linux/config.h>
9945 +#include <linux/pci.h>
9946 +#include <linux/irq.h>
9947 +
9948 +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
9949 +
9950 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
9951 +{
9952 +       u8 config, rev;
9953 +       u32 word;
9954 +
9955 +       /* BIOS may enable hardware IRQ balancing for
9956 +        * E7520/E7320/E7525 (revision ID 0x9 and below)
9957 +        * based platforms.
9958 +        * Disable SW irqbalance/affinity on those platforms.
9959 +        */
9960 +       pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
9961 +       if (rev > 0x9)
9962 +               return;
9963 +
9964 +       printk(KERN_INFO "Intel E7520/7320/7525 detected.\n");
9965 +
9966 +       /* enable access to config space */
9967 +       pci_read_config_byte(dev, 0xf4, &config);
9968 +       pci_write_config_byte(dev, 0xf4, config|0x2);
9969 +
9970 +       /* read xTPR register */
9971 +       raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
9972 +
9973 +       if (!(word & (1 << 13))) {
9974 +               dom0_op_t op;
9975 +               printk(KERN_INFO "Disabling irq balancing and affinity\n");
9976 +               op.cmd = DOM0_PLATFORM_QUIRK;
9977 +               op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
9978 +               (void)HYPERVISOR_dom0_op(&op);
9979 +       }
9980 +
9981 +       /* put back the original value for config space */
9982 +       if (!(config & 0x2))
9983 +               pci_write_config_byte(dev, 0xf4, config);
9984 +}
9985 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  quirk_intel_irqbalance);
9986 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  quirk_intel_irqbalance);
9987 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  quirk_intel_irqbalance);
9988 +#endif
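
The DECLARE_PCI_FIXUP_FINAL() lines register the quirk against the three E75xx memory-controller IDs; the PCI core invokes matching hooks after device enumeration. As an assumption about this kernel generation's <linux/pci.h> (section name and struct layout sketched from memory, not taken from this patch), one such line reduces to roughly:

        static const struct pci_fixup __pci_fixup_quirk_intel_irqbalance
                __attribute__((__section__(".pci_fixup_final"))) = {
                .vendor = PCI_VENDOR_ID_INTEL,
                .device = PCI_DEVICE_ID_INTEL_E7520_MCH,
                .hook   = quirk_intel_irqbalance,
        };
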
9989 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/setup-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/setup-xen.c
9990 --- ref-linux-2.6.16.9/arch/i386/kernel/setup-xen.c     1970-01-01 01:00:00.000000000 +0100
9991 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/setup-xen.c        2006-04-10 00:05:52.000000000 +0200
9992 @@ -0,0 +1,1892 @@
9993 +/*
9994 + *  linux/arch/i386/kernel/setup.c
9995 + *
9996 + *  Copyright (C) 1995  Linus Torvalds
9997 + *
9998 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
9999 + *
10000 + *  Memory region support
10001 + *     David Parsons <orc@pell.chi.il.us>, July-August 1999
10002 + *
10003 + *  Added E820 sanitization routine (removes overlapping memory regions);
10004 + *  Brian Moyle <bmoyle@mvista.com>, February 2001
10005 + *
10006 + * Moved CPU detection code to cpu/${cpu}.c
10007 + *    Patrick Mochel <mochel@osdl.org>, March 2002
10008 + *
10009 + *  Provisions for empty E820 memory regions (reported by certain BIOSes).
10010 + *  Alex Achenbach <xela@slit.de>, December 2002.
10011 + *
10012 + */
10013 +
10014 +/*
10015 + * This file handles the architecture-dependent parts of initialization
10016 + */
10017 +
10018 +#include <linux/config.h>
10019 +#include <linux/sched.h>
10020 +#include <linux/mm.h>
10021 +#include <linux/mmzone.h>
10022 +#include <linux/tty.h>
10023 +#include <linux/ioport.h>
10024 +#include <linux/acpi.h>
10025 +#include <linux/apm_bios.h>
10026 +#include <linux/initrd.h>
10027 +#include <linux/bootmem.h>
10028 +#include <linux/seq_file.h>
10029 +#include <linux/console.h>
10030 +#include <linux/mca.h>
10031 +#include <linux/root_dev.h>
10032 +#include <linux/highmem.h>
10033 +#include <linux/module.h>
10034 +#include <linux/efi.h>
10035 +#include <linux/init.h>
10036 +#include <linux/edd.h>
10037 +#include <linux/nodemask.h>
10038 +#include <linux/kernel.h>
10039 +#include <linux/percpu.h>
10040 +#include <linux/notifier.h>
10041 +#include <linux/kexec.h>
10042 +#include <linux/crash_dump.h>
10043 +#include <linux/dmi.h>
10044 +
10045 +#include <video/edid.h>
10046 +
10047 +#include <asm/apic.h>
10048 +#include <asm/e820.h>
10049 +#include <asm/mpspec.h>
10050 +#include <asm/setup.h>
10051 +#include <asm/arch_hooks.h>
10052 +#include <asm/sections.h>
10053 +#include <asm/io_apic.h>
10054 +#include <asm/ist.h>
10055 +#include <asm/io.h>
10056 +#include <asm/hypervisor.h>
10057 +#include <xen/interface/physdev.h>
10058 +#include <xen/interface/memory.h>
10059 +#include <xen/features.h>
10060 +#include "setup_arch_pre.h"
10061 +#include <bios_ebda.h>
10062 +
10063 +/* Forward Declaration. */
10064 +void __init find_max_pfn(void);
10065 +
10066 +/* Allows setting of maximum possible memory size  */
10067 +static unsigned long xen_override_max_pfn;
10068 +
10069 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
10070 +static struct notifier_block xen_panic_block = {
10071 +       xen_panic_event, NULL, 0 /* try to go last */
10072 +};
10073 +
10074 +extern char hypercall_page[PAGE_SIZE];
10075 +EXPORT_SYMBOL(hypercall_page);
10076 +
10077 +int disable_pse __devinitdata = 0;
10078 +
10079 +/*
10080 + * Machine setup..
10081 + */
10082 +
10083 +#ifdef CONFIG_EFI
10084 +int efi_enabled = 0;
10085 +EXPORT_SYMBOL(efi_enabled);
10086 +#endif
10087 +
10088 +/* cpu data as detected by the assembly code in head.S */
10089 +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10090 +/* common cpu data for all cpus */
10091 +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10092 +EXPORT_SYMBOL(boot_cpu_data);
10093 +
10094 +unsigned long mmu_cr4_features;
10095 +
10096 +#ifdef CONFIG_ACPI
10097 +       int acpi_disabled = 0;
10098 +#else
10099 +       int acpi_disabled = 1;
10100 +#endif
10101 +EXPORT_SYMBOL(acpi_disabled);
10102 +
10103 +#ifdef CONFIG_ACPI
10104 +int __initdata acpi_force = 0;
10105 +extern acpi_interrupt_flags    acpi_sci_flags;
10106 +#endif
10107 +
10108 +/* for MCA, but anyone else can use it if they want */
10109 +unsigned int machine_id;
10110 +#ifdef CONFIG_MCA
10111 +EXPORT_SYMBOL(machine_id);
10112 +#endif
10113 +unsigned int machine_submodel_id;
10114 +unsigned int BIOS_revision;
10115 +unsigned int mca_pentium_flag;
10116 +
10117 +/* For PCI or other memory-mapped resources */
10118 +unsigned long pci_mem_start = 0x10000000;
10119 +#ifdef CONFIG_PCI
10120 +EXPORT_SYMBOL(pci_mem_start);
10121 +#endif
10122 +
10123 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
10124 +int bootloader_type;
10125 +
10126 +/* user-defined highmem size */
10127 +static unsigned int highmem_pages = -1;
10128 +
10129 +/*
10130 + * Setup options
10131 + */
10132 +struct drive_info_struct { char dummy[32]; } drive_info;
10133 +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
10134 +    defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
10135 +EXPORT_SYMBOL(drive_info);
10136 +#endif
10137 +struct screen_info screen_info;
10138 +EXPORT_SYMBOL(screen_info);
10139 +struct apm_info apm_info;
10140 +EXPORT_SYMBOL(apm_info);
10141 +struct sys_desc_table_struct {
10142 +       unsigned short length;
10143 +       unsigned char table[0];
10144 +};
10145 +struct edid_info edid_info;
10146 +EXPORT_SYMBOL_GPL(edid_info);
10147 +struct ist_info ist_info;
10148 +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
10149 +       defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
10150 +EXPORT_SYMBOL(ist_info);
10151 +#endif
10152 +struct e820map e820;
10153 +
10154 +extern void early_cpu_init(void);
10155 +extern void generic_apic_probe(char *);
10156 +extern int root_mountflags;
10157 +
10158 +unsigned long saved_videomode;
10159 +
10160 +#define RAMDISK_IMAGE_START_MASK       0x07FF
10161 +#define RAMDISK_PROMPT_FLAG            0x8000
10162 +#define RAMDISK_LOAD_FLAG              0x4000  
10163 +
10164 +static char command_line[COMMAND_LINE_SIZE];
10165 +
10166 +unsigned char __initdata boot_params[PARAM_SIZE];
10167 +
10168 +static struct resource data_resource = {
10169 +       .name   = "Kernel data",
10170 +       .start  = 0,
10171 +       .end    = 0,
10172 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10173 +};
10174 +
10175 +static struct resource code_resource = {
10176 +       .name   = "Kernel code",
10177 +       .start  = 0,
10178 +       .end    = 0,
10179 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10180 +};
10181 +
10182 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
10183 +static struct resource system_rom_resource = {
10184 +       .name   = "System ROM",
10185 +       .start  = 0xf0000,
10186 +       .end    = 0xfffff,
10187 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10188 +};
10189 +
10190 +static struct resource extension_rom_resource = {
10191 +       .name   = "Extension ROM",
10192 +       .start  = 0xe0000,
10193 +       .end    = 0xeffff,
10194 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10195 +};
10196 +
10197 +static struct resource adapter_rom_resources[] = { {
10198 +       .name   = "Adapter ROM",
10199 +       .start  = 0xc8000,
10200 +       .end    = 0,
10201 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10202 +}, {
10203 +       .name   = "Adapter ROM",
10204 +       .start  = 0,
10205 +       .end    = 0,
10206 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10207 +}, {
10208 +       .name   = "Adapter ROM",
10209 +       .start  = 0,
10210 +       .end    = 0,
10211 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10212 +}, {
10213 +       .name   = "Adapter ROM",
10214 +       .start  = 0,
10215 +       .end    = 0,
10216 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10217 +}, {
10218 +       .name   = "Adapter ROM",
10219 +       .start  = 0,
10220 +       .end    = 0,
10221 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10222 +}, {
10223 +       .name   = "Adapter ROM",
10224 +       .start  = 0,
10225 +       .end    = 0,
10226 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10227 +} };
10228 +
10229 +#define ADAPTER_ROM_RESOURCES \
10230 +       (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
10231 +
10232 +static struct resource video_rom_resource = {
10233 +       .name   = "Video ROM",
10234 +       .start  = 0xc0000,
10235 +       .end    = 0xc7fff,
10236 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10237 +};
10238 +#endif
10239 +
10240 +static struct resource video_ram_resource = {
10241 +       .name   = "Video RAM area",
10242 +       .start  = 0xa0000,
10243 +       .end    = 0xbffff,
10244 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10245 +};
10246 +
10247 +static struct resource standard_io_resources[] = { {
10248 +       .name   = "dma1",
10249 +       .start  = 0x0000,
10250 +       .end    = 0x001f,
10251 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10252 +}, {
10253 +       .name   = "pic1",
10254 +       .start  = 0x0020,
10255 +       .end    = 0x0021,
10256 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10257 +}, {
10258 +       .name   = "timer0",
10259 +       .start  = 0x0040,
10260 +       .end    = 0x0043,
10261 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10262 +}, {
10263 +       .name   = "timer1",
10264 +       .start  = 0x0050,
10265 +       .end    = 0x0053,
10266 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10267 +}, {
10268 +       .name   = "keyboard",
10269 +       .start  = 0x0060,
10270 +       .end    = 0x006f,
10271 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10272 +}, {
10273 +       .name   = "dma page reg",
10274 +       .start  = 0x0080,
10275 +       .end    = 0x008f,
10276 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10277 +}, {
10278 +       .name   = "pic2",
10279 +       .start  = 0x00a0,
10280 +       .end    = 0x00a1,
10281 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10282 +}, {
10283 +       .name   = "dma2",
10284 +       .start  = 0x00c0,
10285 +       .end    = 0x00df,
10286 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10287 +}, {
10288 +       .name   = "fpu",
10289 +       .start  = 0x00f0,
10290 +       .end    = 0x00ff,
10291 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10292 +} };
10293 +
10294 +#define STANDARD_IO_RESOURCES \
10295 +       (sizeof standard_io_resources / sizeof standard_io_resources[0])
10296 +
10297 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
10298 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
10299 +
10300 +static int __init romchecksum(unsigned char *rom, unsigned long length)
10301 +{
10302 +       unsigned char *p, sum = 0;
10303 +
10304 +       for (p = rom; p < rom + length; p++)
10305 +               sum += *p;
10306 +       return sum == 0;
10307 +}
10308 +
10309 +static void __init probe_roms(void)
10310 +{
10311 +       unsigned long start, length, upper;
10312 +       unsigned char *rom;
10313 +       int           i;
10314 +
10315 +       /* Nothing to do if not running in dom0. */
10316 +       if (!(xen_start_info->flags & SIF_INITDOMAIN))
10317 +               return;
10318 +
10319 +       /* video rom */
10320 +       upper = adapter_rom_resources[0].start;
10321 +       for (start = video_rom_resource.start; start < upper; start += 2048) {
10322 +               rom = isa_bus_to_virt(start);
10323 +               if (!romsignature(rom))
10324 +                       continue;
10325 +
10326 +               video_rom_resource.start = start;
10327 +
10328 +               /* 0 < length <= 0x7f * 512, historically */
10329 +               length = rom[2] * 512;
10330 +
10331 +               /* if checksum okay, trust length byte */
10332 +               if (length && romchecksum(rom, length))
10333 +                       video_rom_resource.end = start + length - 1;
10334 +
10335 +               request_resource(&iomem_resource, &video_rom_resource);
10336 +               break;
10337 +       }
10338 +
10339 +       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
10340 +       if (start < upper)
10341 +               start = upper;
10342 +
10343 +       /* system rom */
10344 +       request_resource(&iomem_resource, &system_rom_resource);
10345 +       upper = system_rom_resource.start;
10346 +
10347 +       /* check for extension rom (ignore length byte!) */
10348 +       rom = isa_bus_to_virt(extension_rom_resource.start);
10349 +       if (romsignature(rom)) {
10350 +               length = extension_rom_resource.end - extension_rom_resource.start + 1;
10351 +               if (romchecksum(rom, length)) {
10352 +                       request_resource(&iomem_resource, &extension_rom_resource);
10353 +                       upper = extension_rom_resource.start;
10354 +               }
10355 +       }
10356 +
10357 +       /* check for adapter roms on 2k boundaries */
10358 +       for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
10359 +               rom = isa_bus_to_virt(start);
10360 +               if (!romsignature(rom))
10361 +                       continue;
10362 +
10363 +               /* 0 < length <= 0x7f * 512, historically */
10364 +               length = rom[2] * 512;
10365 +
10366 +               /* but accept any length that fits if checksum okay */
10367 +               if (!length || start + length > upper || !romchecksum(rom, length))
10368 +                       continue;
10369 +
10370 +               adapter_rom_resources[i].start = start;
10371 +               adapter_rom_resources[i].end = start + length - 1;
10372 +               request_resource(&iomem_resource, &adapter_rom_resources[i]);
10373 +
10374 +               start = adapter_rom_resources[i++].end & ~2047UL;
10375 +       }
10376 +}
10377 +#endif
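
probe_roms() leans on the legacy option-ROM header layout: an image begins with the 0x55 0xAA signature and stores its length at offset 2 in 512-byte units, so the scan trusts rom[2] only when the whole image checksums to zero. A worked check against a hypothetical adapter ROM (address and resource are illustrative):

        unsigned char *rom = isa_bus_to_virt(0xc8000);  /* illustrative address */

        if (romsignature(rom)) {        /* rom[0] == 0x55, rom[1] == 0xaa */
                unsigned long length = rom[2] * 512;    /* e.g. 0x40 -> 32 KB */

                /* claim the region only if the checksum holds */
                if (length && romchecksum(rom, length))
                        request_resource(&iomem_resource, &my_rom_resource);
        }
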
10378 +
10379 +/*
10380 + * Point at the empty zero page to start with. We map the real shared_info
10381 + * page as soon as fixmap is up and running.
10382 + */
10383 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
10384 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
10385 +
10386 +unsigned long *phys_to_machine_mapping;
10387 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
10388 +EXPORT_SYMBOL(phys_to_machine_mapping);
10389 +
10390 +/* Raw start-of-day parameters from the hypervisor. */
10391 +start_info_t *xen_start_info;
10392 +EXPORT_SYMBOL(xen_start_info);
10393 +
10394 +static void __init limit_regions(unsigned long long size)
10395 +{
10396 +       unsigned long long current_addr = 0;
10397 +       int i;
10398 +
10399 +       if (efi_enabled) {
10400 +               efi_memory_desc_t *md;
10401 +               void *p;
10402 +
10403 +               for (p = memmap.map, i = 0; p < memmap.map_end;
10404 +                       p += memmap.desc_size, i++) {
10405 +                       md = p;
10406 +                       current_addr = md->phys_addr + (md->num_pages << 12);
10407 +                       if (md->type == EFI_CONVENTIONAL_MEMORY) {
10408 +                               if (current_addr >= size) {
10409 +                                       md->num_pages -=
10410 +                                               (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
10411 +                                       memmap.nr_map = i + 1;
10412 +                                       return;
10413 +                               }
10414 +                       }
10415 +               }
10416 +       }
10417 +       for (i = 0; i < e820.nr_map; i++) {
10418 +               current_addr = e820.map[i].addr + e820.map[i].size;
10419 +               if (current_addr < size)
10420 +                       continue;
10421 +
10422 +               if (e820.map[i].type != E820_RAM)
10423 +                       continue;
10424 +
10425 +               if (e820.map[i].addr >= size) {
10426 +                       /*
10427 +                        * This region starts past the end of the
10428 +                        * requested size, skip it completely.
10429 +                        */
10430 +                       e820.nr_map = i;
10431 +               } else {
10432 +                       e820.nr_map = i + 1;
10433 +                       e820.map[i].size -= current_addr - size;
10434 +               }
10435 +               return;
10436 +       }
10437 +}
10438 +
10439 +static void __init add_memory_region(unsigned long long start,
10440 +                                  unsigned long long size, int type)
10441 +{
10442 +       int x;
10443 +
10444 +       if (!efi_enabled) {
10445 +               x = e820.nr_map;
10446 +
10447 +               if (x == E820MAX) {
10448 +                       printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
10449 +                       return;
10450 +               }
10451 +
10452 +               e820.map[x].addr = start;
10453 +               e820.map[x].size = size;
10454 +               e820.map[x].type = type;
10455 +               e820.nr_map++;
10456 +       }
10457 +} /* add_memory_region */
10458 +
10459 +#define E820_DEBUG     1
10460 +
10461 +static void __init print_memory_map(char *who)
10462 +{
10463 +       int i;
10464 +
10465 +       for (i = 0; i < e820.nr_map; i++) {
10466 +               printk(" %s: %016Lx - %016Lx ", who,
10467 +                       e820.map[i].addr,
10468 +                       e820.map[i].addr + e820.map[i].size);
10469 +               switch (e820.map[i].type) {
10470 +               case E820_RAM:  printk("(usable)\n");
10471 +                               break;
10472 +               case E820_RESERVED:
10473 +                               printk("(reserved)\n");
10474 +                               break;
10475 +               case E820_ACPI:
10476 +                               printk("(ACPI data)\n");
10477 +                               break;
10478 +               case E820_NVS:
10479 +                               printk("(ACPI NVS)\n");
10480 +                               break;
10481 +               default:        printk("type %lu\n", e820.map[i].type);
10482 +                               break;
10483 +               }
10484 +       }
10485 +}
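
For reference, print_memory_map() produces the familiar boot-log lines; a sample with hypothetical addresses:

	 BIOS-e820: 0000000000000000 - 000000000009f000 (usable)
	 BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved)
	 BIOS-e820: 0000000000100000 - 0000000020000000 (usable)
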
10486 +
10487 +#if 0
10488 +/*
10489 + * Sanitize the BIOS e820 map.
10490 + *
10491 + * Some e820 responses include overlapping entries.  The following 
10492 + * replaces the original e820 map with a new one, removing overlaps.
10493 + *
10494 + */
10495 +struct change_member {
10496 +       struct e820entry *pbios; /* pointer to original bios entry */
10497 +       unsigned long long addr; /* address for this change point */
10498 +};
10499 +static struct change_member change_point_list[2*E820MAX] __initdata;
10500 +static struct change_member *change_point[2*E820MAX] __initdata;
10501 +static struct e820entry *overlap_list[E820MAX] __initdata;
10502 +static struct e820entry new_bios[E820MAX] __initdata;
10503 +
10504 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
10505 +{
10506 +       struct change_member *change_tmp;
10507 +       unsigned long current_type, last_type;
10508 +       unsigned long long last_addr;
10509 +       int chgidx, still_changing;
10510 +       int overlap_entries;
10511 +       int new_bios_entry;
10512 +       int old_nr, new_nr, chg_nr;
10513 +       int i;
10514 +
10515 +       /*
10516 +               Visually we're performing the following (1,2,3,4 = memory types)...
10517 +
10518 +               Sample memory map (w/overlaps):
10519 +                  ____22__________________
10520 +                  ______________________4_
10521 +                  ____1111________________
10522 +                  _44_____________________
10523 +                  11111111________________
10524 +                  ____________________33__
10525 +                  ___________44___________
10526 +                  __________33333_________
10527 +                  ______________22________
10528 +                  ___________________2222_
10529 +                  _________111111111______
10530 +                  _____________________11_
10531 +                  _________________4______
10532 +
10533 +               Sanitized equivalent (no overlap):
10534 +                  1_______________________
10535 +                  _44_____________________
10536 +                  ___1____________________
10537 +                  ____22__________________
10538 +                  ______11________________
10539 +                  _________1______________
10540 +                  __________3_____________
10541 +                  ___________44___________
10542 +                  _____________33_________
10543 +                  _______________2________
10544 +                  ________________1_______
10545 +                  _________________4______
10546 +                  ___________________2____
10547 +                  ____________________33__
10548 +                  ______________________4_
10549 +       */
10550 +
10551 +       /* if there's only one memory region, don't bother */
10552 +       if (*pnr_map < 2)
10553 +               return -1;
10554 +
10555 +       old_nr = *pnr_map;
10556 +
10557 +       /* bail out if we find any unreasonable addresses in bios map */
10558 +       for (i=0; i<old_nr; i++)
10559 +               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
10560 +                       return -1;
10561 +
10562 +       /* create pointers for initial change-point information (for sorting) */
10563 +       for (i=0; i < 2*old_nr; i++)
10564 +               change_point[i] = &change_point_list[i];
10565 +
10566 +       /* record all known change-points (starting and ending addresses),
10567 +          omitting those that are for empty memory regions */
10568 +       chgidx = 0;
10569 +       for (i=0; i < old_nr; i++)      {
10570 +               if (biosmap[i].size != 0) {
10571 +                       change_point[chgidx]->addr = biosmap[i].addr;
10572 +                       change_point[chgidx++]->pbios = &biosmap[i];
10573 +                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
10574 +                       change_point[chgidx++]->pbios = &biosmap[i];
10575 +               }
10576 +       }
10577 +       chg_nr = chgidx;        /* true number of change-points */
10578 +
10579 +       /* sort change-point list by memory addresses (low -> high) */
10580 +       still_changing = 1;
10581 +       while (still_changing)  {
10582 +               still_changing = 0;
10583 +               for (i=1; i < chg_nr; i++)  {
10584 +                       /* if <current_addr> > <last_addr>, swap */
10585 +                       /* or, if current=<start_addr> & last=<end_addr>, swap */
10586 +                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
10587 +                               ((change_point[i]->addr == change_point[i-1]->addr) &&
10588 +                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
10589 +                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
10590 +                          )
10591 +                       {
10592 +                               change_tmp = change_point[i];
10593 +                               change_point[i] = change_point[i-1];
10594 +                               change_point[i-1] = change_tmp;
10595 +                               still_changing=1;
10596 +                       }
10597 +               }
10598 +       }
10599 +
10600 +       /* create a new bios memory map, removing overlaps */
10601 +       overlap_entries=0;       /* number of entries in the overlap table */
10602 +       new_bios_entry=0;        /* index for creating new bios map entries */
10603 +       last_type = 0;           /* start with undefined memory type */
10604 +       last_addr = 0;           /* start with 0 as last starting address */
10605 +       /* loop through change-points, determining effect on the new bios map */
10606 +       for (chgidx=0; chgidx < chg_nr; chgidx++)
10607 +       {
10608 +               /* keep track of all overlapping bios entries */
10609 +               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
10610 +               {
10611 +                       /* add map entry to overlap list (> 1 entry implies an overlap) */
10612 +                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
10613 +               }
10614 +               else
10615 +               {
10616 +                       /* remove entry from list (order independent, so swap with last) */
10617 +                       for (i=0; i<overlap_entries; i++)
10618 +                       {
10619 +                               if (overlap_list[i] == change_point[chgidx]->pbios)
10620 +                                       overlap_list[i] = overlap_list[overlap_entries-1];
10621 +                       }
10622 +                       overlap_entries--;
10623 +               }
10624 +               /* if there are overlapping entries, decide which "type" to use */
10625 +               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
10626 +               current_type = 0;
10627 +               for (i=0; i<overlap_entries; i++)
10628 +                       if (overlap_list[i]->type > current_type)
10629 +                               current_type = overlap_list[i]->type;
10630 +               /* continue building up new bios map based on this information */
10631 +               if (current_type != last_type)  {
10632 +                       if (last_type != 0)      {
10633 +                               new_bios[new_bios_entry].size =
10634 +                                       change_point[chgidx]->addr - last_addr;
10635 +                               /* move forward only if the new size was non-zero */
10636 +                               if (new_bios[new_bios_entry].size != 0)
10637 +                                       if (++new_bios_entry >= E820MAX)
10638 +                                               break;  /* no more space left for new bios entries */
10639 +                       }
10640 +                       if (current_type != 0)  {
10641 +                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
10642 +                               new_bios[new_bios_entry].type = current_type;
10643 +                               last_addr=change_point[chgidx]->addr;
10644 +                       }
10645 +                       last_type = current_type;
10646 +               }
10647 +       }
10648 +       new_nr = new_bios_entry;   /* retain count for new bios entries */
10649 +
10650 +       /* copy new bios mapping into original location */
10651 +       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
10652 +       *pnr_map = new_nr;
10653 +
10654 +       return 0;
10655 +}
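
The change-point sweep above is easiest to follow on a two-entry overlap. A hand-worked example (1 = E820_RAM, 2 = E820_RESERVED; values illustrative):

	input:  { addr 0x00000, size 0x100000, RAM      }
	        { addr 0xe0000, size 0x020000, RESERVED }

	change-points, sorted: 0x00000 start-RAM, 0xe0000 start-RESERVED,
	                       0x100000 end-RAM, 0x100000 end-RESERVED

	At 0xe0000 both entries overlap; RESERVED (2) outranks RAM (1), so
	the RAM entry is closed there and a RESERVED entry opened.

	output: { addr 0x00000, size 0xe0000, RAM      }
	        { addr 0xe0000, size 0x20000, RESERVED }
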
10656 +
10657 +/*
10658 + * Copy the BIOS e820 map into a safe place.
10659 + *
10660 + * Sanity-check it while we're at it..
10661 + *
10662 + * If we're lucky and live on a modern system, the setup code
10663 + * will have given us a memory map that we can use to properly
10664 + * set up memory.  If we aren't, we'll fake a memory map.
10665 + *
10666 + * We check to see that the memory map contains at least 2 elements
10667 + * before we'll use it, because the detection code in setup.S may
10668 + * not be perfect and most every PC known to man has two memory
10669 + * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
10670 + * thinkpad 560x, for example, does not cooperate with the memory
10671 + * detection code.)
10672 + */
10673 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
10674 +{
10675 +       /* Only one memory region (or negative)? Ignore it */
10676 +       if (nr_map < 2)
10677 +               return -1;
10678 +
10679 +       do {
10680 +               unsigned long long start = biosmap->addr;
10681 +               unsigned long long size = biosmap->size;
10682 +               unsigned long long end = start + size;
10683 +               unsigned long type = biosmap->type;
10684 +
10685 +               /* Overflow in 64 bits? Ignore the memory map. */
10686 +               if (start > end)
10687 +                       return -1;
10688 +
10689 +               /*
10690 +                * Some BIOSes claim RAM in the 640k - 1M region.
10691 +                * Not right. Fix it up.
10692 +                */
10693 +               if (type == E820_RAM) {
10694 +                       if (start < 0x100000ULL && end > 0xA0000ULL) {
10695 +                               if (start < 0xA0000ULL)
10696 +                                       add_memory_region(start, 0xA0000ULL-start, type);
10697 +                               if (end <= 0x100000ULL)
10698 +                                       continue;
10699 +                               start = 0x100000ULL;
10700 +                               size = end - start;
10701 +                       }
10702 +               }
10703 +               add_memory_region(start, size, type);
10704 +       } while (biosmap++,--nr_map);
10705 +       return 0;
10706 +}
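
The 640k-1M fixup in copy_e820_map() splits an offending region in two. Worked through for one hypothetical BIOS entry:

	/* BIOS claims RAM over [0x90000, 0x200000), crossing the VGA/ROM hole. */
	/* start < 0x100000 && end > 0xa0000, and start < 0xa0000, so:          */
	add_memory_region(0x90000, 0xa0000 - 0x90000, E820_RAM);    /* [576k, 640k) */
	/* end > 0x100000, so the remainder restarts at 1MB:                    */
	add_memory_region(0x100000, 0x200000 - 0x100000, E820_RAM); /* [1MB, 2MB)   */
	/* The [0xa0000, 0x100000) hole is dropped entirely. */
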
10707 +#endif
10708 +
10709 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
10710 +struct edd edd;
10711 +#ifdef CONFIG_EDD_MODULE
10712 +EXPORT_SYMBOL(edd);
10713 +#endif
10714 +/**
10715 + * copy_edd() - Copy the BIOS EDD information
10716 + *              from boot_params into a safe place.
10717 + *
10718 + */
10719 +static inline void copy_edd(void)
10720 +{
10721 +     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
10722 +     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
10723 +     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
10724 +     edd.edd_info_nr = EDD_NR;
10725 +}
10726 +#else
10727 +static inline void copy_edd(void)
10728 +{
10729 +}
10730 +#endif
10731 +
10732 +/*
10733 + * Do NOT EVER look at the BIOS memory size location.
10734 + * It does not work on many machines.
10735 + */
10736 +#define LOWMEMSIZE()   (0x9f000)
10737 +
10738 +static void __init parse_cmdline_early (char ** cmdline_p)
10739 +{
10740 +       char c = ' ', *to = command_line, *from = saved_command_line;
10741 +       int len = 0, max_cmdline;
10742 +       int userdef = 0;
10743 +
10744 +       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
10745 +               max_cmdline = COMMAND_LINE_SIZE;
10746 +       memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
10747 +       /* Save unparsed command line copy for /proc/cmdline */
10748 +       saved_command_line[max_cmdline-1] = '\0';
10749 +
10750 +       for (;;) {
10751 +               if (c != ' ')
10752 +                       goto next_char;
10753 +               /*
10754 +                * "mem=nopentium" disables the 4MB page tables.
10755 +                * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
10756 +                * to <mem>, overriding the bios size.
10757 +                * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
10758 +                * <start> to <start>+<mem>, overriding the bios size.
10759 +                *
10760 +                * HPA tells me bootloaders need to parse mem=, so no new
10761 +                * option should be mem=  [also see Documentation/i386/boot.txt]
10762 +                */
10763 +               if (!memcmp(from, "mem=", 4)) {
10764 +                       if (to != command_line)
10765 +                               to--;
10766 +                       if (!memcmp(from+4, "nopentium", 9)) {
10767 +                               from += 9+4;
10768 +                               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
10769 +                               disable_pse = 1;
10770 +                       } else {
10771 +                               /* If the user specifies memory size, we
10772 +                                * limit the BIOS-provided memory map to
10773 +                                * that size. exactmap can be used to specify
10774 +                                * the exact map. mem=number can be used to
10775 +                                * trim the existing memory map.
10776 +                                */
10777 +                               unsigned long long mem_size;
10778 +
10779 +                               mem_size = memparse(from+4, &from);
10780 +#if 0
10781 +                               limit_regions(mem_size);
10782 +                               userdef=1;
10783 +#else
10784 +                               xen_override_max_pfn =
10785 +                                       (unsigned long)(mem_size>>PAGE_SHIFT);
10786 +#endif
10787 +                       }
10788 +               }
10789 +
10790 +               else if (!memcmp(from, "memmap=", 7)) {
10791 +                       if (to != command_line)
10792 +                               to--;
10793 +                       if (!memcmp(from+7, "exactmap", 8)) {
10794 +#ifdef CONFIG_CRASH_DUMP
10795 +                               /* If we are doing a crash dump, we
10796 +                                * still need to know the real mem
10797 +                                * size before original memory map is
10798 +                                * reset.
10799 +                                */
10800 +                               find_max_pfn();
10801 +                               saved_max_pfn = max_pfn;
10802 +#endif
10803 +                               from += 8+7;
10804 +                               e820.nr_map = 0;
10805 +                               userdef = 1;
10806 +                       } else {
10807 +                               /* If the user specifies memory size, we
10808 +                                * limit the BIOS-provided memory map to
10809 +                                * that size. exactmap can be used to specify
10810 +                                * the exact map. mem=number can be used to
10811 +                                * trim the existing memory map.
10812 +                                */
10813 +                               unsigned long long start_at, mem_size;
10814 +
10815 +                               mem_size = memparse(from+7, &from);
10816 +                               if (*from == '@') {
10817 +                                       start_at = memparse(from+1, &from);
10818 +                                       add_memory_region(start_at, mem_size, E820_RAM);
10819 +                               } else if (*from == '#') {
10820 +                                       start_at = memparse(from+1, &from);
10821 +                                       add_memory_region(start_at, mem_size, E820_ACPI);
10822 +                               } else if (*from == '$') {
10823 +                                       start_at = memparse(from+1, &from);
10824 +                                       add_memory_region(start_at, mem_size, E820_RESERVED);
10825 +                               } else {
10826 +                                       limit_regions(mem_size);
10827 +                                       userdef=1;
10828 +                               }
10829 +                       }
10830 +               }
10831 +
10832 +               else if (!memcmp(from, "noexec=", 7))
10833 +                       noexec_setup(from + 7);
10834 +
10835 +
10836 +#ifdef  CONFIG_X86_MPPARSE
10837 +               /*
10838 +                * If the BIOS enumerates physical processors before logical,
10839 +                * maxcpus=N at enumeration-time can be used to disable HT.
10840 +                */
10841 +               else if (!memcmp(from, "maxcpus=", 8)) {
10842 +                       extern unsigned int maxcpus;
10843 +
10844 +                       maxcpus = simple_strtoul(from + 8, NULL, 0);
10845 +               }
10846 +#endif
10847 +
10848 +#ifdef CONFIG_ACPI
10849 +               /* "acpi=off" disables both ACPI table parsing and interpreter */
10850 +               else if (!memcmp(from, "acpi=off", 8)) {
10851 +                       disable_acpi();
10852 +               }
10853 +
10854 +               /* acpi=force to over-ride black-list */
10855 +               else if (!memcmp(from, "acpi=force", 10)) {
10856 +                       acpi_force = 1;
10857 +                       acpi_ht = 1;
10858 +                       acpi_disabled = 0;
10859 +               }
10860 +
10861 +               /* acpi=strict disables out-of-spec workarounds */
10862 +               else if (!memcmp(from, "acpi=strict", 11)) {
10863 +                       acpi_strict = 1;
10864 +               }
10865 +
10866 +               /* Limit ACPI just to boot-time to enable HT */
10867 +               else if (!memcmp(from, "acpi=ht", 7)) {
10868 +                       if (!acpi_force)
10869 +                               disable_acpi();
10870 +                       acpi_ht = 1;
10871 +               }
10872 +               
10873 +               /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
10874 +               else if (!memcmp(from, "pci=noacpi", 10)) {
10875 +                       acpi_disable_pci();
10876 +               }
10877 +               /* "acpi=noirq" disables ACPI interrupt routing */
10878 +               else if (!memcmp(from, "acpi=noirq", 10)) {
10879 +                       acpi_noirq_set();
10880 +               }
10881 +
10882 +               else if (!memcmp(from, "acpi_sci=edge", 13))
10883 +                       acpi_sci_flags.trigger = 1;
10884 +
10885 +               else if (!memcmp(from, "acpi_sci=level", 14))
10886 +                       acpi_sci_flags.trigger = 3;
10887 +
10888 +               else if (!memcmp(from, "acpi_sci=high", 13))
10889 +                       acpi_sci_flags.polarity = 1;
10890 +
10891 +               else if (!memcmp(from, "acpi_sci=low", 12))
10892 +                       acpi_sci_flags.polarity = 3;
10893 +
10894 +#ifdef CONFIG_X86_IO_APIC
10895 +               else if (!memcmp(from, "acpi_skip_timer_override", 24))
10896 +                       acpi_skip_timer_override = 1;
10897 +
10898 +               else if (!memcmp(from, "disable_timer_pin_1", 19))
10899 +                       disable_timer_pin_1 = 1;
10900 +               else if (!memcmp(from, "enable_timer_pin_1", 18))
10901 +                       disable_timer_pin_1 = -1;
10902 +
10903 +               /* disable IO-APIC */
10904 +               else if (!memcmp(from, "noapic", 6))
10905 +                       disable_ioapic_setup();
10906 +#endif /* CONFIG_X86_IO_APIC */
10907 +#endif /* CONFIG_ACPI */
10908 +
10909 +#ifdef CONFIG_X86_LOCAL_APIC
10910 +               /* enable local APIC */
10911 +               else if (!memcmp(from, "lapic", 5))
10912 +                       lapic_enable();
10913 +
10914 +               /* disable local APIC */
10915 +               else if (!memcmp(from, "nolapic", 7))
10916 +                       lapic_disable();
10917 +#endif /* CONFIG_X86_LOCAL_APIC */
10918 +
10919 +#ifdef CONFIG_KEXEC
10920 +               /* crashkernel=size@addr specifies the location to reserve for
10921 +                * a crash kernel.  By reserving this memory we guarantee
10922 +                * that Linux never sets it up as a DMA target.
10923 +                * Useful for holding code to do something appropriate
10924 +                * after a kernel panic.
10925 +                */
10926 +               else if (!memcmp(from, "crashkernel=", 12)) {
10927 +                       unsigned long size, base;
10928 +                       size = memparse(from+12, &from);
10929 +                       if (*from == '@') {
10930 +                               base = memparse(from+1, &from);
10931 +                               /* FIXME: Do I want a sanity check
10932 +                                * to validate the memory range?
10933 +                                */
10934 +                               crashk_res.start = base;
10935 +                               crashk_res.end   = base + size - 1;
10936 +                       }
10937 +               }
10938 +#endif
10939 +#ifdef CONFIG_PROC_VMCORE
10940 +               /* elfcorehdr= specifies the location of elf core header
10941 +                * stored by the crashed kernel.
10942 +                */
10943 +               else if (!memcmp(from, "elfcorehdr=", 11))
10944 +                       elfcorehdr_addr = memparse(from+11, &from);
10945 +#endif
10946 +
10947 +               /*
10948 +                * highmem=size forces highmem to be exactly 'size' bytes.
10949 +                * This works even on boxes that have no highmem otherwise.
10950 +                * This also works to reduce highmem size on bigger boxes.
10951 +                */
10952 +               else if (!memcmp(from, "highmem=", 8))
10953 +                       highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
10954 +       
10955 +               /*
10956 +                * vmalloc=size forces the vmalloc area to be exactly 'size'
10957 +                * bytes. This can be used to increase (or decrease) the
10958 +                * vmalloc area - the default is 128m.
10959 +                */
10960 +               else if (!memcmp(from, "vmalloc=", 8))
10961 +                       __VMALLOC_RESERVE = memparse(from+8, &from);
10962 +
10963 +       next_char:
10964 +               c = *(from++);
10965 +               if (!c)
10966 +                       break;
10967 +               if (COMMAND_LINE_SIZE <= ++len)
10968 +                       break;
10969 +               *(to++) = c;
10970 +       }
10971 +       *to = '\0';
10972 +       *cmdline_p = command_line;
10973 +       if (userdef) {
10974 +               printk(KERN_INFO "user-defined physical RAM map:\n");
10975 +               print_memory_map("user");
10976 +       }
10977 +}
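
Two hand-worked command-line examples for the parser above (PAGE_SHIFT = 12, as on i386):

	mem=512M       -> memparse() yields 0x20000000; under Xen this sets
	                  xen_override_max_pfn = 0x20000000 >> 12 = 131072
	                  pages rather than trimming the e820 map.

	memmap=64M@16M -> memparse() yields 0x4000000 and 0x1000000; the '@'
	                  branch calls add_memory_region(0x1000000, 0x4000000,
	                  E820_RAM), i.e. RAM over [16MB, 80MB).
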
10978 +
10979 +#if 0 /* !XEN */
10980 +/*
10981 + * Callback for efi_memory_walk.
10982 + */
10983 +static int __init
10984 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
10985 +{
10986 +       unsigned long *max_pfn = arg, pfn;
10987 +
10988 +       if (start < end) {
10989 +               pfn = PFN_UP(end - 1);
10990 +               if (pfn > *max_pfn)
10991 +                       *max_pfn = pfn;
10992 +       }
10993 +       return 0;
10994 +}
10995 +
10996 +static int __init
10997 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
10998 +{
10999 +       memory_present(0, start, end);
11000 +       return 0;
11001 +}
11002 +
11003 +/*
11004 + * Find the highest page frame number we have available
11005 + */
11006 +void __init find_max_pfn(void)
11007 +{
11008 +       int i;
11009 +
11010 +       max_pfn = 0;
11011 +       if (efi_enabled) {
11012 +               efi_memmap_walk(efi_find_max_pfn, &max_pfn);
11013 +               efi_memmap_walk(efi_memory_present_wrapper, NULL);
11014 +               return;
11015 +       }
11016 +
11017 +       for (i = 0; i < e820.nr_map; i++) {
11018 +               unsigned long start, end;
11019 +               /* RAM? */
11020 +               if (e820.map[i].type != E820_RAM)
11021 +                       continue;
11022 +               start = PFN_UP(e820.map[i].addr);
11023 +               end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11024 +               if (start >= end)
11025 +                       continue;
11026 +               if (end > max_pfn)
11027 +                       max_pfn = end;
11028 +               memory_present(0, start, end);
11029 +       }
11030 +}
11031 +#else
11032 +/* We don't use the fake e820 because we need to respond to user override. */
11033 +void __init find_max_pfn(void)
11034 +{
11035 +       if (xen_override_max_pfn == 0) {
11036 +               max_pfn = xen_start_info->nr_pages;
11037 +               /* Default 8MB slack (to balance backend allocations). */
11038 +               max_pfn += 8 << (20 - PAGE_SHIFT);
11039 +       } else if (xen_override_max_pfn > xen_start_info->nr_pages) {
11040 +               max_pfn = xen_override_max_pfn;
11041 +       } else {
11042 +               max_pfn = xen_start_info->nr_pages;
11043 +       }
11044 +}
11045 +#endif /* XEN */
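
The 8MB slack is plain pfn arithmetic; one worked example (nr_pages hypothetical, PAGE_SHIFT = 12):

	/* Guest booted with 256MB and no mem= override: */
	max_pfn = 65536 + (8 << (20 - 12));	/* 65536 + 2048 pages */
	/* = 67584 pfns, leaving 8MB of p2m headroom for balloon growth. */
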
11046 +
11047 +/*
11048 + * Determine low and high memory ranges:
11049 + */
11050 +unsigned long __init find_max_low_pfn(void)
11051 +{
11052 +       unsigned long max_low_pfn;
11053 +
11054 +       max_low_pfn = max_pfn;
11055 +       if (max_low_pfn > MAXMEM_PFN) {
11056 +               if (highmem_pages == -1)
11057 +                       highmem_pages = max_pfn - MAXMEM_PFN;
11058 +               if (highmem_pages + MAXMEM_PFN < max_pfn)
11059 +                       max_pfn = MAXMEM_PFN + highmem_pages;
11060 +               if (highmem_pages + MAXMEM_PFN > max_pfn) {
11061 +                       printk(KERN_WARNING "only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
11062 +                       highmem_pages = 0;
11063 +               }
11064 +               max_low_pfn = MAXMEM_PFN;
11065 +#ifndef CONFIG_HIGHMEM
11066 +               /* Maximum memory usable is what is directly addressable */
11067 +               printk(KERN_WARNING "Warning: only %ldMB will be used.\n",
11068 +                                       MAXMEM>>20);
11069 +               if (max_pfn > MAX_NONPAE_PFN)
11070 +                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11071 +               else
11072 +                       printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
11073 +               max_pfn = MAXMEM_PFN;
11074 +#else /* !CONFIG_HIGHMEM */
11075 +#ifndef CONFIG_X86_PAE
11076 +               if (max_pfn > MAX_NONPAE_PFN) {
11077 +                       max_pfn = MAX_NONPAE_PFN;
11078 +                       printk(KERN_WARNING "Warning: only 4GB will be used.\n");
11079 +                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11080 +               }
11081 +#endif /* !CONFIG_X86_PAE */
11082 +#endif /* !CONFIG_HIGHMEM */
11083 +       } else {
11084 +               if (highmem_pages == -1)
11085 +                       highmem_pages = 0;
11086 +#ifdef CONFIG_HIGHMEM
11087 +               if (highmem_pages >= max_pfn) {
11088 +                       printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
11089 +                       highmem_pages = 0;
11090 +               }
11091 +               if (highmem_pages) {
11092 +                       if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
11093 +                               printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
11094 +                               highmem_pages = 0;
11095 +                       }
11096 +                       max_low_pfn -= highmem_pages;
11097 +               }
11098 +#else
11099 +               if (highmem_pages)
11100 +                       printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
11101 +#endif
11102 +       }
11103 +       return max_low_pfn;
11104 +}
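
A worked example of the lowmem/highmem split above, assuming the usual i386 3G/1G split where MAXMEM is roughly 896MB (MAXMEM_PFN = 229376):

	max_pfn = 262144;			/* 1GB of RAM */
	/* max_pfn > MAXMEM_PFN and highmem_pages == -1, so: */
	highmem_pages = 262144 - 229376;	/* 32768 pages = 128MB highmem */
	max_low_pfn = 229376;			/* 896MB of lowmem */
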
11105 +
11106 +/*
11107 + * Free all available memory for boot time allocation.  Used
11108 + * as a callback function by efi_memory_walk()
11109 + */
11110 +
11111 +static int __init
11112 +free_available_memory(unsigned long start, unsigned long end, void *arg)
11113 +{
11114 +       /* check max_low_pfn */
11115 +       if (start >= ((max_low_pfn + 1) << PAGE_SHIFT))
11116 +               return 0;
11117 +       if (end >= ((max_low_pfn + 1) << PAGE_SHIFT))
11118 +               end = (max_low_pfn + 1) << PAGE_SHIFT;
11119 +       if (start < end)
11120 +               free_bootmem(start, end - start);
11121 +
11122 +       return 0;
11123 +}
11124 +/*
11125 + * Register fully available low RAM pages with the bootmem allocator.
11126 + */
11127 +static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
11128 +{
11129 +       int i;
11130 +
11131 +       if (efi_enabled) {
11132 +               efi_memmap_walk(free_available_memory, NULL);
11133 +               return;
11134 +       }
11135 +       for (i = 0; i < e820.nr_map; i++) {
11136 +               unsigned long curr_pfn, last_pfn, size;
11137 +               /*
11138 +                * Register usable low memory with the bootmem allocator
11139 +                */
11140 +               if (e820.map[i].type != E820_RAM)
11141 +                       continue;
11142 +               /*
11143 +                * We are rounding up the start address of usable memory:
11144 +                */
11145 +               curr_pfn = PFN_UP(e820.map[i].addr);
11146 +               if (curr_pfn >= max_low_pfn)
11147 +                       continue;
11148 +               /*
11149 +                * ... and at the end of the usable range downwards:
11150 +                */
11151 +               last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11152 +
11153 +               if (last_pfn > max_low_pfn)
11154 +                       last_pfn = max_low_pfn;
11155 +
11156 +               /*
11157 +                * .. finally, did all the rounding and playing
11158 +                * around just make the area go away?
11159 +                */
11160 +               if (last_pfn <= curr_pfn)
11161 +                       continue;
11162 +
11163 +               size = last_pfn - curr_pfn;
11164 +               free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
11165 +       }
11166 +}
11167 +
11168 +#ifndef CONFIG_XEN
11169 +/*
11170 + * workaround for Dell systems that neglect to reserve EBDA
11171 + */
11172 +static void __init reserve_ebda_region(void)
11173 +{
11174 +       unsigned int addr;
11175 +       addr = get_bios_ebda();
11176 +       if (addr)
11177 +               reserve_bootmem(addr, PAGE_SIZE);       
11178 +}
11179 +#endif
11180 +
11181 +#ifndef CONFIG_NEED_MULTIPLE_NODES
11182 +void __init setup_bootmem_allocator(void);
11183 +static unsigned long __init setup_memory(void)
11184 +{
11185 +       /*
11186 +        * partially used pages are not usable - thus
11187 +        * we are rounding upwards:
11188 +        */
11189 +       min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
11190 +               xen_start_info->nr_pt_frames;
11191 +
11192 +       find_max_pfn();
11193 +
11194 +       max_low_pfn = find_max_low_pfn();
11195 +
11196 +#ifdef CONFIG_HIGHMEM
11197 +       highstart_pfn = highend_pfn = max_pfn;
11198 +       if (max_pfn > max_low_pfn) {
11199 +               highstart_pfn = max_low_pfn;
11200 +       }
11201 +       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
11202 +               pages_to_mb(highend_pfn - highstart_pfn));
11203 +#endif
11204 +       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
11205 +                       pages_to_mb(max_low_pfn));
11206 +
11207 +       setup_bootmem_allocator();
11208 +
11209 +       return max_low_pfn;
11210 +}
11211 +
11212 +void __init zone_sizes_init(void)
11213 +{
11214 +       unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
11215 +       unsigned int max_dma, low;
11216 +
11217 +       /*
11218 +        * XEN: Our notion of "DMA memory" is fake when running over Xen.
11219 +        * We simply put all RAM in the DMA zone so that those drivers which
11220 +        * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
11221 +        * Those drivers that *do* require lowmem are screwed anyway when
11222 +        * running over Xen!
11223 +        */
11224 +       max_dma = max_low_pfn;
11225 +       low = max_low_pfn;
11226 +
11227 +       if (low < max_dma)
11228 +               zones_size[ZONE_DMA] = low;
11229 +       else {
11230 +               zones_size[ZONE_DMA] = max_dma;
11231 +               zones_size[ZONE_NORMAL] = low - max_dma;
11232 +#ifdef CONFIG_HIGHMEM
11233 +               zones_size[ZONE_HIGHMEM] = highend_pfn - low;
11234 +#endif
11235 +       }
11236 +       free_area_init(zones_size);
11237 +}
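
Since max_dma and low are both set to max_low_pfn above, the else branch always runs and all of lowmem lands in ZONE_DMA, as the XEN comment intends:

	zones_size[ZONE_DMA]    = max_low_pfn;	/* everything below highmem */
	zones_size[ZONE_NORMAL] = 0;
	/* plus ZONE_HIGHMEM = highend_pfn - max_low_pfn, when configured */
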
11238 +#else
11239 +extern unsigned long __init setup_memory(void);
11240 +extern void zone_sizes_init(void);
11241 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
11242 +
11243 +void __init setup_bootmem_allocator(void)
11244 +{
11245 +       unsigned long bootmap_size;
11246 +       /*
11247 +        * Initialize the boot-time allocator (with low memory only):
11248 +        */
11249 +       bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
11250 +
11251 +       register_bootmem_low_pages(max_low_pfn);
11252 +
11253 +       /*
11254 +        * Reserve the bootmem bitmap itself as well. We do this in two
11255 +        * steps (first step was init_bootmem()) because this catches
11256 +        * the (very unlikely) case of us accidentally initializing the
11257 +        * bootmem allocator with an invalid RAM area.
11258 +        */
11259 +       reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
11260 +                        bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
11261 +
11262 +#ifndef CONFIG_XEN
11263 +       /*
11264 +        * reserve physical page 0 - it's a special BIOS page on many boxes,
11265 +        * enabling clean reboots, SMP operation, laptop functions.
11266 +        */
11267 +       reserve_bootmem(0, PAGE_SIZE);
11268 +
11269 +       /* reserve EBDA region, it's a 4K region */
11270 +       reserve_ebda_region();
11271 +
11272 +       /* Could be an AMD 768MPX chipset. Reserve a page before VGA to
11273 +        * prevent PCI prefetch into it (errata #56). Usually the page is
11274 +        * reserved anyway, unless you have no PS/2 mouse plugged in. */
11275 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
11276 +           boot_cpu_data.x86 == 6)
11277 +               reserve_bootmem(0xa0000 - 4096, 4096);
11278 +
11279 +#ifdef CONFIG_SMP
11280 +       /*
11281 +        * But first pinch a few for the stack/trampoline stuff
11282 +        * FIXME: Don't need the extra page at 4K, but need to fix
11283 +        * trampoline before removing it. (see the GDT stuff)
11284 +        */
11285 +       reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
11286 +#endif
11287 +#ifdef CONFIG_ACPI_SLEEP
11288 +       /*
11289 +        * Reserve low memory region for sleep support.
11290 +        */
11291 +       acpi_reserve_bootmem();
11292 +#endif
11293 +#endif /* !CONFIG_XEN */
11294 +
11295 +#ifdef CONFIG_BLK_DEV_INITRD
11296 +       if (xen_start_info->mod_start) {
11297 +               if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
11298 +                       /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
11299 +                       initrd_start = INITRD_START + PAGE_OFFSET;
11300 +                       initrd_end = initrd_start+INITRD_SIZE;
11301 +                       initrd_below_start_ok = 1;
11302 +               }
11303 +               else {
11304 +                       printk(KERN_ERR "initrd extends beyond end of memory "
11305 +                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11306 +                           INITRD_START + INITRD_SIZE,
11307 +                           max_low_pfn << PAGE_SHIFT);
11308 +                       initrd_start = 0;
11309 +               }
11310 +       }
11311 +#endif
11312 +
11313 +       if (!xen_feature(XENFEAT_auto_translated_physmap))
11314 +               phys_to_machine_mapping =
11315 +                       (unsigned long *)xen_start_info->mfn_list;
11316 +}
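
The mfn_list handed over above is the initial p2m table; setup_arch() later re-allocates it and publishes it through a two-level frame list. The sizing works out as follows on i386 (a sketch, assuming 4-byte longs and 4KB pages):

	fpp = PAGE_SIZE / sizeof(unsigned long);	/* 1024 entries/frame */
	/* Each pfn_to_mfn_frame_list[k] page names 1024 p2m frames, and the
	 * list-of-lists holds at most 16 such pages (the BUG_ON(k >= 16) in
	 * setup_arch), so save/restore can describe up to
	 *   16 * 1024 * 1024 = 16M pfns  ->  64GB of guest pseudo-RAM. */
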
11317 +
11318 +/*
11319 + * The node 0 pgdat is initialized before all of these because
11320 + * it's needed for bootmem.  node>0 pgdats have their virtual
11321 + * space allocated before the pagetables are in place to access
11322 + * them, so they can't be cleared then.
11323 + *
11324 + * This should all compile down to nothing when NUMA is off.
11325 + */
11326 +void __init remapped_pgdat_init(void)
11327 +{
11328 +       int nid;
11329 +
11330 +       for_each_online_node(nid) {
11331 +               if (nid != 0)
11332 +                       memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
11333 +       }
11334 +}
11335 +
11336 +/*
11337 + * Request address space for all standard RAM and ROM resources
11338 + * and also for regions reported as reserved by the e820.
11339 + */
11340 +static void __init
11341 +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
11342 +{
11343 +       int i;
11344 +#ifdef CONFIG_XEN
11345 +       dom0_op_t op;
11346 +       struct dom0_memory_map_entry *map;
11347 +       unsigned long gapstart, gapsize;
11348 +       unsigned long long last;
11349 +#endif
11350 +
11351 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
11352 +       probe_roms();
11353 +#endif
11354 +
11355 +#ifdef CONFIG_XEN
11356 +       map = alloc_bootmem_low_pages(PAGE_SIZE);
11357 +       op.cmd = DOM0_PHYSICAL_MEMORY_MAP;
11358 +       op.u.physical_memory_map.memory_map = map;
11359 +       op.u.physical_memory_map.max_map_entries =
11360 +               PAGE_SIZE / sizeof(struct dom0_memory_map_entry);
11361 +       BUG_ON(HYPERVISOR_dom0_op(&op));
11362 +
11363 +       last = 0x100000000ULL;
11364 +       gapstart = 0x10000000;
11365 +       gapsize = 0x400000;
11366 +
11367 +       for (i = op.u.physical_memory_map.nr_map_entries - 1; i >= 0; i--) {
11368 +               struct resource *res;
11369 +
11370 +               if ((last > map[i].end) && ((last - map[i].end) > gapsize)) {
11371 +                       gapsize = last - map[i].end;
11372 +                       gapstart = map[i].end;
11373 +               }
11374 +               if (map[i].start < last)
11375 +                       last = map[i].start;
11376 +
11377 +               if (map[i].end > 0x100000000ULL)
11378 +                       continue;
11379 +               res = alloc_bootmem_low(sizeof(struct resource));
11380 +               res->name = map[i].is_ram ? "System RAM" : "reserved";
11381 +               res->start = map[i].start;
11382 +               res->end = map[i].end - 1;
11383 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
11384 +               request_resource(&iomem_resource, res);
11385 +       }
11386 +
11387 +       free_bootmem(__pa(map), PAGE_SIZE);
11388 +
11389 +       /*
11390 +        * Start allocating dynamic PCI memory a bit into the gap,
11391 +        * aligned up to the nearest megabyte.
11392 +        *
11393 +        * Question: should we try to pad it up a bit (do something
11394 +        * like " + (gapsize >> 3)" in there too?). We now have the
11395 +        * technology.
11396 +        */
11397 +       pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
11398 +
11399 +       printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
11400 +               pci_mem_start, gapstart, gapsize);
11401 +#else
11402 +       for (i = 0; i < e820.nr_map; i++) {
11403 +               struct resource *res;
11404 +               if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
11405 +                       continue;
11406 +               res = alloc_bootmem_low(sizeof(struct resource));
11407 +               switch (e820.map[i].type) {
11408 +               case E820_RAM:  res->name = "System RAM"; break;
11409 +               case E820_ACPI: res->name = "ACPI Tables"; break;
11410 +               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
11411 +               default:        res->name = "reserved";
11412 +               }
11413 +               res->start = e820.map[i].addr;
11414 +               res->end = res->start + e820.map[i].size - 1;
11415 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
11416 +               request_resource(&iomem_resource, res);
11417 +               if (e820.map[i].type == E820_RAM) {
11418 +                       /*
11419 +                        *  We don't know which RAM region contains kernel data,
11420 +                        *  so we try it repeatedly and let the resource manager
11421 +                        *  test it.
11422 +                        */
11423 +                       request_resource(res, code_resource);
11424 +                       request_resource(res, data_resource);
11425 +#ifdef CONFIG_KEXEC
11426 +                       request_resource(res, &crashk_res);
11427 +#endif
11428 +               }
11429 +       }
11430 +#endif
11431 +#ifdef CONFIG_KEXEC
11432 +       if (crashk_res.start != crashk_res.end)
11433 +               reserve_bootmem(crashk_res.start,
11434 +                       crashk_res.end - crashk_res.start + 1);
11435 +#endif
11436 +}
11437 +
11438 +/*
11439 + * Request address space for all standard resources
11440 + */
11441 +static void __init register_memory(void)
11442 +{
11443 +#ifndef CONFIG_XEN
11444 +       unsigned long gapstart, gapsize, round;
11445 +       unsigned long long last;
11446 +#endif
11447 +       int           i;
11448 +
11449 +       /* Nothing to do if not running in dom0. */
11450 +       if (!(xen_start_info->flags & SIF_INITDOMAIN))
11451 +               return;
11452 +
11453 +       if (efi_enabled)
11454 +               efi_initialize_iomem_resources(&code_resource, &data_resource);
11455 +       else
11456 +               legacy_init_iomem_resources(&code_resource, &data_resource);
11457 +
11458 +       /* EFI systems may still have VGA */
11459 +       request_resource(&iomem_resource, &video_ram_resource);
11460 +
11461 +       /* request I/O space for devices used on all i[345]86 PCs */
11462 +       for (i = 0; i < STANDARD_IO_RESOURCES; i++)
11463 +               request_resource(&ioport_resource, &standard_io_resources[i]);
11464 +
11465 +#ifndef CONFIG_XEN
11466 +       /*
11467 +        * Search for the biggest gap in the low 32 bits of the e820
11468 +        * memory space.
11469 +        */
11470 +       last = 0x100000000ull;
11471 +       gapstart = 0x10000000;
11472 +       gapsize = 0x400000;
11473 +       i = e820.nr_map;
11474 +       while (--i >= 0) {
11475 +               unsigned long long start = e820.map[i].addr;
11476 +               unsigned long long end = start + e820.map[i].size;
11477 +
11478 +               /*
11479 +                * Since "last" is at most 4GB, we know we'll
11480 +                * fit in 32 bits if this condition is true
11481 +                */
11482 +               if (last > end) {
11483 +                       unsigned long gap = last - end;
11484 +
11485 +                       if (gap > gapsize) {
11486 +                               gapsize = gap;
11487 +                               gapstart = end;
11488 +                       }
11489 +               }
11490 +               if (start < last)
11491 +                       last = start;
11492 +       }
11493 +
11494 +       /*
11495 +        * See how much we want to round up: start off with
11496 +        * rounding to the next 1MB area.
11497 +        */
11498 +       round = 0x100000;
11499 +       while ((gapsize >> 4) > round)
11500 +               round += round;
11501 +       /* Fun with two's complement */
11502 +       pci_mem_start = (gapstart + round) & -round;
11503 +
11504 +       printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
11505 +               pci_mem_start, gapstart, gapsize);
11506 +#endif
11507 +}
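
The rounding loop at the end is worth checking by hand; one hypothetical gap:

	/* gapstart = 0xe0000000, gapsize = 0x1c000000 (a 448MB gap).
	 * round doubles from 0x100000 while (gapsize >> 4) > round, i.e.
	 * while 0x1c00000 > round, stopping at round = 0x2000000 (32MB). */
	pci_mem_start = (0xe0000000 + 0x2000000) & -0x2000000;	/* 0xe2000000 */
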
11508 +
11509 +/* Use inline assembly to define this because the nops are defined 
11510 +   as inline assembly strings in the include files and we cannot 
11511 +   get them easily into strings. */
11512 +asm("\t.data\nintelnops: " 
11513 +    GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
11514 +    GENERIC_NOP7 GENERIC_NOP8); 
11515 +asm("\t.data\nk8nops: " 
11516 +    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
11517 +    K8_NOP7 K8_NOP8); 
11518 +asm("\t.data\nk7nops: " 
11519 +    K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
11520 +    K7_NOP7 K7_NOP8); 
11521 +    
11522 +extern unsigned char intelnops[], k8nops[], k7nops[];
11523 +static unsigned char *intel_nops[ASM_NOP_MAX+1] = { 
11524 +     NULL,
11525 +     intelnops,
11526 +     intelnops + 1,
11527 +     intelnops + 1 + 2,
11528 +     intelnops + 1 + 2 + 3,
11529 +     intelnops + 1 + 2 + 3 + 4,
11530 +     intelnops + 1 + 2 + 3 + 4 + 5,
11531 +     intelnops + 1 + 2 + 3 + 4 + 5 + 6,
11532 +     intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
11533 +}; 
11534 +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { 
11535 +     NULL,
11536 +     k8nops,
11537 +     k8nops + 1,
11538 +     k8nops + 1 + 2,
11539 +     k8nops + 1 + 2 + 3,
11540 +     k8nops + 1 + 2 + 3 + 4,
11541 +     k8nops + 1 + 2 + 3 + 4 + 5,
11542 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6,
11543 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
11544 +}; 
11545 +static unsigned char *k7_nops[ASM_NOP_MAX+1] = { 
11546 +     NULL,
11547 +     k7nops,
11548 +     k7nops + 1,
11549 +     k7nops + 1 + 2,
11550 +     k7nops + 1 + 2 + 3,
11551 +     k7nops + 1 + 2 + 3 + 4,
11552 +     k7nops + 1 + 2 + 3 + 4 + 5,
11553 +     k7nops + 1 + 2 + 3 + 4 + 5 + 6,
11554 +     k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
11555 +}; 
11556 +static struct nop { 
11557 +     int cpuid; 
11558 +     unsigned char **noptable; 
11559 +} noptypes[] = { 
11560 +     { X86_FEATURE_K8, k8_nops }, 
11561 +     { X86_FEATURE_K7, k7_nops }, 
11562 +     { -1, NULL }
11563 +}; 
11564 +
11565 +/* Replace instructions with better alternatives for this CPU type.
11566 +
11567 +   This runs before SMP is initialized to avoid SMP problems with
11568 +   self-modifying code. This implies that asymmetric systems where
11569 +   APs have fewer capabilities than the boot processor are not handled.
11570 +   Tough. Make sure you disable such features by hand. */ 
11571 +void apply_alternatives(void *start, void *end) 
11572 +{ 
11573 +       struct alt_instr *a; 
11574 +       int diff, i, k;
11575 +       unsigned char **noptable = intel_nops;
11576 +       for (i = 0; noptypes[i].cpuid >= 0; i++) { 
11577 +               if (boot_cpu_has(noptypes[i].cpuid)) { 
11578 +                       noptable = noptypes[i].noptable;
11579 +                       break;
11580 +               }
11581 +       } 
11582 +       for (a = start; (void *)a < end; a++) { 
11583 +               if (!boot_cpu_has(a->cpuid))
11584 +                       continue;
11585 +               BUG_ON(a->replacementlen > a->instrlen); 
11586 +               memcpy(a->instr, a->replacement, a->replacementlen); 
11587 +               diff = a->instrlen - a->replacementlen; 
11588 +               /* Pad the rest with nops */
11589 +               for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
11590 +                       k = diff;
11591 +                       if (k > ASM_NOP_MAX)
11592 +                               k = ASM_NOP_MAX;
11593 +                       memcpy(a->instr + i, noptable[k], k); 
11594 +               } 
11595 +       }
11596 +} 
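
The nop tables above are laid out so that noptable[k] points at one contiguous k-byte nop; a padding example with hypothetical lengths:

	/* 5-byte instruction, 2-byte replacement: diff = 3, so one pass with
	 * k = min(3, ASM_NOP_MAX) = 3 copies a single 3-byte nop:            */
	memcpy(a->instr + 2, noptable[3], 3);
	/* Gaps wider than ASM_NOP_MAX are filled ASM_NOP_MAX bytes at a time. */
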
11597 +
11598 +void __init alternative_instructions(void)
11599 +{
11600 +       extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
11601 +       apply_alternatives(__alt_instructions, __alt_instructions_end);
11602 +}
11603 +
11604 +static char * __init machine_specific_memory_setup(void);
11605 +
11606 +#ifdef CONFIG_MCA
11607 +static void set_mca_bus(int x)
11608 +{
11609 +       MCA_bus = x;
11610 +}
11611 +#else
11612 +static void set_mca_bus(int x) { }
11613 +#endif
11614 +
11615 +/*
11616 + * Determine if we were loaded by an EFI loader.  If so, then we have also been
11617 + * passed the efi memmap, systab, etc., so we should use these data structures
11618 + * for initialization.  Note, the efi init code path is determined by the
11619 + * global efi_enabled. This allows the same kernel image to be used on existing
11620 + * systems (with a traditional BIOS) as well as on EFI systems.
11621 + */
11622 +void __init setup_arch(char **cmdline_p)
11623 +{
11624 +       int i, j, k, fpp;
11625 +       physdev_op_t op;
11626 +       unsigned long max_low_pfn;
11627 +
11628 +       /* Force a quick death if the kernel panics (not domain 0). */
11629 +       extern int panic_timeout;
11630 +       if (!panic_timeout && !(xen_start_info->flags & SIF_INITDOMAIN))
11631 +               panic_timeout = 1;
11632 +
11633 +       /* Register a call for panic conditions. */
11634 +       notifier_chain_register(&panic_notifier_list, &xen_panic_block);
11635 +
11636 +       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
11637 +       HYPERVISOR_vm_assist(VMASST_CMD_enable,
11638 +                            VMASST_TYPE_writable_pagetables);
11639 +
11640 +       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
11641 +       early_cpu_init();
11642 +
11643 +       /*
11644 +        * FIXME: This isn't an official loader_type right
11645 +        * now but does currently work with elilo.
11646 +        * If we were configured as an EFI kernel, check to make
11647 +        * sure that we were loaded correctly from elilo and that
11648 +        * the system table is valid.  If not, then initialize normally.
11649 +        */
11650 +#ifdef CONFIG_EFI
11651 +       if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
11652 +               efi_enabled = 1;
11653 +#endif
11654 +
11655 +       /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
11656 +          properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
11657 +       */
11658 +       ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
11659 +       drive_info = DRIVE_INFO;
11660 +       screen_info = SCREEN_INFO;
11661 +       edid_info = EDID_INFO;
11662 +       apm_info.bios = APM_BIOS_INFO;
11663 +       ist_info = IST_INFO;
11664 +       saved_videomode = VIDEO_MODE;
11665 +       if (SYS_DESC_TABLE.length != 0) {
11666 +               set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
11667 +               machine_id = SYS_DESC_TABLE.table[0];
11668 +               machine_submodel_id = SYS_DESC_TABLE.table[1];
11669 +               BIOS_revision = SYS_DESC_TABLE.table[2];
11670 +       }
11671 +       bootloader_type = LOADER_TYPE;
11672 +
11673 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
11674 +               /* This is drawn from a dump from vgacon:startup in
11675 +                * standard Linux. */
11676 +               screen_info.orig_video_mode = 3; 
11677 +               screen_info.orig_video_isVGA = 1;
11678 +               screen_info.orig_video_lines = 25;
11679 +               screen_info.orig_video_cols = 80;
11680 +               screen_info.orig_video_ega_bx = 3;
11681 +               screen_info.orig_video_points = 16;
11682 +       } else
11683 +               screen_info.orig_video_isVGA = 0;
11684 +
11685 +#ifdef CONFIG_BLK_DEV_RAM
11686 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
11687 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
11688 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
11689 +#endif
11690 +
11691 +       setup_xen_features();
11692 +
11693 +       ARCH_SETUP
11694 +       if (efi_enabled)
11695 +               efi_init();
11696 +       else {
11697 +               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
11698 +               print_memory_map(machine_specific_memory_setup());
11699 +       }
11700 +
11701 +       copy_edd();
11702 +
11703 +       if (!MOUNT_ROOT_RDONLY)
11704 +               root_mountflags &= ~MS_RDONLY;
11705 +       init_mm.start_code = (unsigned long) _text;
11706 +       init_mm.end_code = (unsigned long) _etext;
11707 +       init_mm.end_data = (unsigned long) _edata;
11708 +       init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
11709 +                      xen_start_info->nr_pt_frames) << PAGE_SHIFT;
11710 +
11711 +       /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */
11712 +       /*code_resource.start = virt_to_phys(_text);*/
11713 +       /*code_resource.end = virt_to_phys(_etext)-1;*/
11714 +       /*data_resource.start = virt_to_phys(_etext);*/
11715 +       /*data_resource.end = virt_to_phys(_edata)-1;*/
11716 +
11717 +       parse_cmdline_early(cmdline_p);
11718 +
11719 +       max_low_pfn = setup_memory();
11720 +
11721 +       /*
11722 +        * NOTE: before this point _nobody_ is allowed to allocate
11723 +        * any memory using the bootmem allocator.  Although the
11724 +        * allocator is now initialised, only the first 8MB of the kernel
11725 +        * virtual address space has been mapped.  All allocations before
11726 +        * paging_init() has completed must use the alloc_bootmem_low_pages()
11727 +        * variant (which allocates DMA'able memory), and care must be taken
11728 +        * not to exceed the 8MB limit.
11729 +        */
11730 +
11731 +#ifdef CONFIG_SMP
11732 +       smp_alloc_memory(); /* AP processor realmode stacks in low memory */
11733 +#endif
11734 +       paging_init();
11735 +       remapped_pgdat_init();
11736 +       sparse_init();
11737 +       zone_sizes_init();
11738 +
11739 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
11740 +       /*
11741 +        * Find and reserve possible boot-time SMP configuration:
11742 +        */
11743 +       find_smp_config();
11744 +#endif
11745 +
11746 +       /* Make sure we have a correctly sized P->M table. */
11747 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
11748 +               phys_to_machine_mapping = alloc_bootmem_low_pages(
11749 +                    max_pfn * sizeof(unsigned long));
11750 +               memset(phys_to_machine_mapping, ~0,
11751 +                      max_pfn * sizeof(unsigned long));
11752 +               memcpy(phys_to_machine_mapping,
11753 +                      (unsigned long *)xen_start_info->mfn_list,
11754 +                      xen_start_info->nr_pages * sizeof(unsigned long));
11755 +               free_bootmem(
11756 +                    __pa(xen_start_info->mfn_list),
11757 +                    PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
11758 +                                    sizeof(unsigned long))));
11759 +
11760 +               /*
11761 +                * Initialise the two-level list of frames that identifies
11762 +                * the frames making up the p2m table. Used by save/restore.
11763 +                */
11764 +               pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
11765 +               HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
11766 +                    virt_to_mfn(pfn_to_mfn_frame_list_list);
11767 +
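+               /*
+                * fpp = p2m entries per page.  pfn_to_mfn_frame_list[k] is
+                * the k-th second-level page; each of its fpp slots holds
+                * the mfn of one page of the p2m table, and the top-level
+                * page records the mfn of each second-level page.
+                */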
11768 +               fpp = PAGE_SIZE/sizeof(unsigned long);
11769 +               for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
11770 +                       if ((j % fpp) == 0) {
11771 +                               k++;
11772 +                               BUG_ON(k >= 16);
11773 +                               pfn_to_mfn_frame_list[k] =
11774 +                                       alloc_bootmem_low_pages(PAGE_SIZE);
11775 +                               pfn_to_mfn_frame_list_list[k] =
11776 +                                       virt_to_mfn(pfn_to_mfn_frame_list[k]);
11777 +                               j = 0;
11778 +                       }
11779 +                       pfn_to_mfn_frame_list[k][j] =
11780 +                               virt_to_mfn(&phys_to_machine_mapping[i]);
11781 +               }
11782 +               HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
11783 +       }
11784 +
11785 +       /*
11786 +        * NOTE: at this point the bootmem allocator is fully available.
11787 +        */
11788 +
11789 +#ifdef CONFIG_EARLY_PRINTK
11790 +       {
11791 +               char *s = strstr(*cmdline_p, "earlyprintk=");
11792 +               if (s) {
11793 +                       extern void setup_early_printk(char *);
11794 +
11795 +                       setup_early_printk(strchr(s, '=') + 1);
11796 +                       printk("early console enabled\n");
11797 +               }
11798 +       }
11799 +#endif
11800 +
11801 +       if (xen_start_info->flags & SIF_INITDOMAIN)
11802 +               dmi_scan_machine();
11803 +
11804 +#ifdef CONFIG_X86_GENERICARCH
11805 +       generic_apic_probe(*cmdline_p);
11806 +#endif 
11807 +       if (efi_enabled)
11808 +               efi_map_memmap();
11809 +
11810 +       op.cmd             = PHYSDEVOP_SET_IOPL;
11811 +       op.u.set_iopl.iopl = 1;
11812 +       HYPERVISOR_physdev_op(&op);
11813 +
11814 +#ifdef CONFIG_X86_IO_APIC
11815 +       check_acpi_pci();       /* Checks more than just ACPI actually */
11816 +#endif
11817 +
11818 +#ifdef CONFIG_ACPI
11819 +       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
11820 +               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
11821 +               acpi_disabled = 1;
11822 +               acpi_ht = 0;
11823 +       }
11824 +
11825 +       /*
11826 +        * Parse the ACPI tables for possible boot-time SMP configuration.
11827 +        */
11828 +       acpi_boot_table_init();
11829 +       acpi_boot_init();
11830 +
11831 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
11832 +       if (def_to_bigsmp)
11833 +               printk(KERN_WARNING "More than 8 CPUs detected and "
11834 +                       "CONFIG_X86_PC cannot handle it.\nUse "
11835 +                       "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
11836 +#endif
11837 +#endif
11838 +#ifdef CONFIG_X86_LOCAL_APIC
11839 +       if (smp_found_config)
11840 +               get_smp_config();
11841 +#endif
11842 +
11843 +       /* XXX Disable irqdebug until we have a way to avoid interrupt
11844 +        * conflicts. */
11845 +       noirqdebug_setup("");
11846 +
11847 +       register_memory();
11848 +
11849 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
11850 +               if (!(xen_start_info->flags & SIF_PRIVILEGED))
11851 +                       panic("Xen granted us console access "
11852 +                             "but not privileged status");
11853 +
11854 +#ifdef CONFIG_VT
11855 +#if defined(CONFIG_VGA_CONSOLE)
11856 +               if (!efi_enabled ||
11857 +                   (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
11858 +                       conswitchp = &vga_con;
11859 +#elif defined(CONFIG_DUMMY_CONSOLE)
11860 +               conswitchp = &dummy_con;
11861 +#endif
11862 +#endif
11863 +       } else {
11864 +               extern int console_use_vt;
11865 +               console_use_vt = 0;
11866 +       }
11867 +}
11868 +
11869 +static int
11870 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
11871 +{
11872 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
11873 +       /* we're never actually going to get here... */
11874 +       return NOTIFY_DONE;
11875 +}
11876 +
11877 +#include "setup_arch_post.h"
11878 +/*
11879 + * Local Variables:
11880 + * mode:c
11881 + * c-file-style:"k&r"
11882 + * c-basic-offset:8
11883 + * End:
11884 + */
11885 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/smpalts.c tmp-linux-2.6-xen.patch/arch/i386/kernel/smpalts.c
11886 --- ref-linux-2.6.16.9/arch/i386/kernel/smpalts.c       1970-01-01 01:00:00.000000000 +0100
11887 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/smpalts.c  2006-04-10 00:05:52.000000000 +0200
11888 @@ -0,0 +1,85 @@
11889 +#include <linux/kernel.h>
11890 +#include <asm/system.h>
11891 +#include <asm/smp_alt.h>
11892 +#include <asm/processor.h>
11893 +#include <asm/string.h>
11894 +
11895 +struct smp_replacement_record {
11896 +       unsigned char targ_size;
11897 +       unsigned char smp1_size;
11898 +       unsigned char smp2_size;
11899 +       unsigned char up_size;
11900 +       unsigned char feature;
11901 +       unsigned char data[0];
11902 +};
11903 +
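+/*
+ * data[] holds three alternative instruction sequences back to back:
+ * the first smp1_size bytes are the SMP sequence, the next smp2_size
+ * bytes are an SMP variant used when the CPU has 'feature', and the
+ * final up_size bytes are the UP sequence.  Whichever is patched in,
+ * the rest of the target is padded with 0x90 (NOP).
+ */
+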
11904 +struct smp_alternative_record {
11905 +       void *targ_start;
11906 +       struct smp_replacement_record *repl;
11907 +};
11908 +
11909 +extern struct smp_alternative_record __start_smp_alternatives_table,
11910 +  __stop_smp_alternatives_table;
11911 +extern unsigned long __init_begin, __init_end;
11912 +
11913 +void prepare_for_smp(void)
11914 +{
11915 +       struct smp_alternative_record *r;
11916 +       printk(KERN_INFO "Enabling SMP...\n");
11917 +       for (r = &__start_smp_alternatives_table;
11918 +            r != &__stop_smp_alternatives_table;
11919 +            r++) {
11920 +               BUG_ON(r->repl->targ_size < r->repl->smp1_size);
11921 +               BUG_ON(r->repl->targ_size < r->repl->smp2_size);
11922 +               BUG_ON(r->repl->targ_size < r->repl->up_size);
11923 +               if (system_state == SYSTEM_RUNNING &&
11924 +                   r->targ_start >= (void *)&__init_begin &&
11925 +                   r->targ_start < (void *)&__init_end)
11926 +                       continue;
11927 +               if (r->repl->feature != (unsigned char)-1 &&
11928 +                   boot_cpu_has(r->repl->feature)) {
11929 +                       memcpy(r->targ_start,
11930 +                              r->repl->data + r->repl->smp1_size,
11931 +                              r->repl->smp2_size);
11932 +                       memset(r->targ_start + r->repl->smp2_size,
11933 +                              0x90,
11934 +                              r->repl->targ_size - r->repl->smp2_size);
11935 +               } else {
11936 +                       memcpy(r->targ_start,
11937 +                              r->repl->data,
11938 +                              r->repl->smp1_size);
11939 +                       memset(r->targ_start + r->repl->smp1_size,
11940 +                              0x90,
11941 +                              r->repl->targ_size - r->repl->smp1_size);
11942 +               }
11943 +       }
11944 +       /* Paranoia: the jump flushes prefetched instructions after patching. */
11945 +       asm volatile ("jmp 1f\n1:");
11946 +       mb();
11947 +}
11948 +
11949 +void unprepare_for_smp(void)
11950 +{
11951 +       struct smp_alternative_record *r;
11952 +       printk(KERN_INFO "Disabling SMP...\n");
11953 +       for (r = &__start_smp_alternatives_table;
11954 +            r != &__stop_smp_alternatives_table;
11955 +            r++) {
11956 +               BUG_ON(r->repl->targ_size < r->repl->smp1_size);
11957 +               BUG_ON(r->repl->targ_size < r->repl->smp2_size);
11958 +               BUG_ON(r->repl->targ_size < r->repl->up_size);
11959 +               if (system_state == SYSTEM_RUNNING &&
11960 +                   r->targ_start >= (void *)&__init_begin &&
11961 +                   r->targ_start < (void *)&__init_end)
11962 +                       continue;
11963 +               memcpy(r->targ_start,
11964 +                      r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
11965 +                      r->repl->up_size);
11966 +               memset(r->targ_start + r->repl->up_size,
11967 +                      0x90,
11968 +                      r->repl->targ_size - r->repl->up_size);
11969 +       }
11970 +       /* Paranoia: the jump flushes prefetched instructions after patching. */
11971 +       asm volatile ("jmp 1f\n1:");
11972 +       mb();
11973 +}
11974 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/smpboot.c tmp-linux-2.6-xen.patch/arch/i386/kernel/smpboot.c
11975 --- ref-linux-2.6.16.9/arch/i386/kernel/smpboot.c       2006-04-19 08:10:14.000000000 +0200
11976 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/smpboot.c  2006-04-10 00:05:52.000000000 +0200
11977 @@ -1218,6 +1218,11 @@ static void __init smp_boot_cpus(unsigne
11978                 if (max_cpus <= cpucount+1)
11979                         continue;
11980  
11981 +#ifdef CONFIG_SMP_ALTERNATIVES
11982 +               if (kicked == 1)
11983 +                       prepare_for_smp();
11984 +#endif
11985 +
11986                 if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
11987                         printk("CPU #%d not responding - cannot use it.\n",
11988                                                                 apicid);
11989 @@ -1396,6 +1401,11 @@ int __devinit __cpu_up(unsigned int cpu)
11990                 return -EIO;
11991         }
11992  
11993 +#ifdef CONFIG_SMP_ALTERNATIVES
11994 +       if (num_online_cpus() == 1)
11995 +               prepare_for_smp();
11996 +#endif
11997 +
11998         local_irq_enable();
11999         per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
12000         /* Unleash the CPU! */
12001 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/smp-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/smp-xen.c
12002 --- ref-linux-2.6.16.9/arch/i386/kernel/smp-xen.c       1970-01-01 01:00:00.000000000 +0100
12003 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/smp-xen.c  2006-04-10 00:05:52.000000000 +0200
12004 @@ -0,0 +1,617 @@
12005 +/*
12006 + *     Intel SMP support routines.
12007 + *
12008 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
12009 + *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
12010 + *
12011 + *     This code is released under the GNU General Public License version 2 or
12012 + *     later.
12013 + */
12014 +
12015 +#include <linux/init.h>
12016 +
12017 +#include <linux/mm.h>
12018 +#include <linux/delay.h>
12019 +#include <linux/spinlock.h>
12020 +#include <linux/smp_lock.h>
12021 +#include <linux/kernel_stat.h>
12022 +#include <linux/mc146818rtc.h>
12023 +#include <linux/cache.h>
12024 +#include <linux/interrupt.h>
12025 +#include <linux/cpu.h>
12026 +#include <linux/module.h>
12027 +
12028 +#include <asm/mtrr.h>
12029 +#include <asm/tlbflush.h>
12030 +#if 0
12031 +#include <mach_apic.h>
12032 +#endif
12033 +#include <xen/evtchn.h>
12034 +
12035 +/*
12036 + *     Some notes on x86 processor bugs affecting SMP operation:
12037 + *
12038 + *     Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
12039 + *     The Linux implications for SMP are handled as follows:
12040 + *
12041 + *     Pentium III / [Xeon]
12042 + *             None of the E1AP-E3AP errata are visible to the user.
12043 + *
12044 + *     E1AP.   see PII A1AP
12045 + *     E2AP.   see PII A2AP
12046 + *     E3AP.   see PII A3AP
12047 + *
12048 + *     Pentium II / [Xeon]
12049 + *             None of the A1AP-A3AP errata are visible to the user.
12050 + *
12051 + *     A1AP.   see PPro 1AP
12052 + *     A2AP.   see PPro 2AP
12053 + *     A3AP.   see PPro 7AP
12054 + *
12055 + *     Pentium Pro
12056 + *             None of 1AP-9AP errata are visible to the normal user,
12057 + *     except occasional delivery of 'spurious interrupt' as trap #15.
12058 + *     This is very rare and a non-problem.
12059 + *
12060 + *     1AP.    Linux maps APIC as non-cacheable
12061 + *     2AP.    worked around in hardware
12062 + *     3AP.    fixed in C0 and above steppings microcode update.
12063 + *             Linux does not use excessive STARTUP_IPIs.
12064 + *     4AP.    worked around in hardware
12065 + *     5AP.    symmetric IO mode (normal Linux operation) not affected.
12066 + *             'noapic' mode has vector 0xf filled out properly.
12067 + *     6AP.    'noapic' mode might be affected - fixed in later steppings
12068 + *     7AP.    We do not assume writes to the LVT deasserting IRQs
12069 + *     8AP.    We do not enable low power mode (deep sleep) during MP bootup
12070 + *     9AP.    We do not use mixed mode
12071 + *
12072 + *     Pentium
12073 + *             There is a marginal case where REP MOVS on 100MHz SMP
12074 + *     machines with B stepping processors can fail. XXX should provide
12075 + *     an L1cache=Writethrough or L1cache=off option.
12076 + *
12077 + *             B stepping CPUs may hang. There are hardware work arounds
12078 + *     for this. We warn about it in case your board doesn't have the work
12079 + *     arounds. Basically that's so I can tell anyone with a B stepping
12080 + *     CPU and SMP problems "tough".
12081 + *
12082 + *     Specific items [From Pentium Processor Specification Update]
12083 + *
12084 + *     1AP.    Linux doesn't use remote read
12085 + *     2AP.    Linux doesn't trust APIC errors
12086 + *     3AP.    We work around this
12087 + *     4AP.    Linux never generated 3 interrupts of the same priority
12088 + *             to cause a lost local interrupt.
12089 + *     5AP.    Remote read is never used
12090 + *     6AP.    not affected - worked around in hardware
12091 + *     7AP.    not affected - worked around in hardware
12092 + *     8AP.    worked around in hardware - we get explicit CS errors if not
12093 + *     9AP.    only 'noapic' mode affected. Might generate spurious
12094 + *             interrupts, we log only the first one and count the
12095 + *             rest silently.
12096 + *     10AP.   not affected - worked around in hardware
12097 + *     11AP.   Linux reads the APIC between writes to avoid this, as per
12098 + *             the documentation. Make sure you preserve this as it affects
12099 + *             the C stepping chips too.
12100 + *     12AP.   not affected - worked around in hardware
12101 + *     13AP.   not affected - worked around in hardware
12102 + *     14AP.   we always deassert INIT during bootup
12103 + *     15AP.   not affected - worked around in hardware
12104 + *     16AP.   not affected - worked around in hardware
12105 + *     17AP.   not affected - worked around in hardware
12106 + *     18AP.   not affected - worked around in hardware
12107 + *     19AP.   not affected - worked around in BIOS
12108 + *
12109 + *     If this sounds worrying, believe me, these bugs are either ___RARE___,
12110 + *     or are signal timing bugs worked around in hardware and there's
12111 + *     about nothing of note with C stepping upwards.
12112 + */
12113 +
12114 +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
12115 +
12116 +/*
12117 + * the following functions deal with sending IPIs between CPUs.
12118 + *
12119 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
12120 + */
12121 +
12122 +static inline int __prepare_ICR (unsigned int shortcut, int vector)
12123 +{
12124 +       return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
12125 +}
12126 +
12127 +static inline int __prepare_ICR2 (unsigned int mask)
12128 +{
12129 +       return SET_APIC_DEST_FIELD(mask);
12130 +}
12131 +
12132 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
12133 +
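+/*
+ * Under Xen, IPIs travel over event channels rather than through the
+ * local APIC: ipi_to_irq maps each (cpu, vector) pair to a pre-bound
+ * irq, and notifying the remote end of that channel raises the IPI.
+ */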
12134 +static inline void __send_IPI_one(unsigned int cpu, int vector)
12135 +{
12136 +       int irq = per_cpu(ipi_to_irq, cpu)[vector];
12137 +       BUG_ON(irq < 0);
12138 +       notify_remote_via_irq(irq);
12139 +}
12140 +
12141 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
12142 +{
12143 +       int cpu;
12144 +
12145 +       switch (shortcut) {
12146 +       case APIC_DEST_SELF:
12147 +               __send_IPI_one(smp_processor_id(), vector);
12148 +               break;
12149 +       case APIC_DEST_ALLBUT:
12150 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12151 +                       if (cpu == smp_processor_id())
12152 +                               continue;
12153 +                       if (cpu_isset(cpu, cpu_online_map)) {
12154 +                               __send_IPI_one(cpu, vector);
12155 +                       }
12156 +               }
12157 +               break;
12158 +       default:
12159 +               printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
12160 +                      vector);
12161 +               break;
12162 +       }
12163 +}
12164 +
12165 +void fastcall send_IPI_self(int vector)
12166 +{
12167 +       __send_IPI_shortcut(APIC_DEST_SELF, vector);
12168 +}
12169 +
12170 +/*
12171 + * This is only used on smaller machines.
12172 + */
12173 +void send_IPI_mask_bitmask(cpumask_t mask, int vector)
12174 +{
12175 +       unsigned long flags;
12176 +       unsigned int cpu;
12177 +
12178 +       local_irq_save(flags);
12179 +       WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
12180 +
12181 +       for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12182 +               if (cpu_isset(cpu, mask)) {
12183 +                       __send_IPI_one(cpu, vector);
12184 +               }
12185 +       }
12186 +
12187 +       local_irq_restore(flags);
12188 +}
12189 +
12190 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
12191 +{
12192 +
12193 +       send_IPI_mask_bitmask(mask, vector);
12194 +}
12195 +
12196 +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
12197 +
12198 +#if 0 /* XEN */
12199 +/*
12200 + *     Smarter SMP flushing macros. 
12201 + *             c/o Linus Torvalds.
12202 + *
12203 + *     These mean you can really definitely utterly forget about
12204 + *     writing to user space from interrupts. (It's not allowed anyway.)
12205 + *
12206 + *     Optimizations Manfred Spraul <manfred@colorfullife.com>
12207 + */
12208 +
12209 +static cpumask_t flush_cpumask;
12210 +static struct mm_struct * flush_mm;
12211 +static unsigned long flush_va;
12212 +static DEFINE_SPINLOCK(tlbstate_lock);
12213 +#define FLUSH_ALL      0xffffffff
12214 +
12215 +/*
12216 + * We cannot call mmdrop() because we are in interrupt context, 
12217 + * instead update mm->cpu_vm_mask.
12218 + *
12219 + * We need to reload %cr3 since the page tables may be going
12220 + * away from under us..
12221 + */
12222 +static inline void leave_mm (unsigned long cpu)
12223 +{
12224 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
12225 +               BUG();
12226 +       cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
12227 +       load_cr3(swapper_pg_dir);
12228 +}
12229 +
12230 +/*
12231 + *
12232 + * The flush IPI assumes that a thread switch happens in this order:
12233 + * [cpu0: the cpu that switches]
12234 + * 1) switch_mm() either 1a) or 1b)
12235 + * 1a) thread switch to a different mm
12236 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
12237 + *     Stop ipi delivery for the old mm. This is not synchronized with
12238 + *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
12239 + *     for the wrong mm, and in the worst case we perform a superfluous
12240 + *     tlb flush.
12241 + * 1a2) set cpu_tlbstate to TLBSTATE_OK
12242 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
12243 + *     was in lazy tlb mode.
12244 + * 1a3) update cpu_tlbstate[].active_mm
12245 + *     Now cpu0 accepts tlb flushes for the new mm.
12246 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
12247 + *     Now the other cpus will send tlb flush ipis.
12248 + * 1a4) change cr3.
12249 + * 1b) thread switch without mm change
12250 + *     cpu_tlbstate[].active_mm is correct, cpu0 already handles
12251 + *     flush ipis.
12252 + * 1b1) set cpu_tlbstate to TLBSTATE_OK
12253 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
12254 + *     Atomically set the bit [other cpus will start sending flush ipis],
12255 + *     and test the bit.
12256 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
12257 + * 2) switch %%esp, ie current
12258 + *
12259 + * The interrupt must handle 2 special cases:
12260 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
12261 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
12262 + *   runs in kernel space, the cpu could load tlb entries for user space
12263 + *   pages.
12264 + *
12265 + * The good news is that cpu_tlbstate is local to each cpu, no
12266 + * write/read ordering problems.
12267 + */
12268 +
12269 +/*
12270 + * TLB flush IPI:
12271 + *
12272 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
12273 + * 2) Leave the mm if we are in the lazy tlb mode.
12274 + */
12275 +
12276 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
12277 +                                    struct pt_regs *regs)
12278 +{
12279 +       unsigned long cpu;
12280 +
12281 +       cpu = get_cpu();
12282 +
12283 +       if (!cpu_isset(cpu, flush_cpumask))
12284 +               goto out;
12285 +               /* 
12286 +                * This was a BUG() but until someone can quote me the
12287 +                * line from the Intel manual that guarantees an IPI to
12288 +                * multiple CPUs is retried _only_ on the erroring CPUs,
12289 +                * it's staying as a return.
12290 +                *
12291 +                * BUG();
12292 +                */
12293 +                
12294 +       if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
12295 +               if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
12296 +                       if (flush_va == FLUSH_ALL)
12297 +                               local_flush_tlb();
12298 +                       else
12299 +                               __flush_tlb_one(flush_va);
12300 +               } else
12301 +                       leave_mm(cpu);
12302 +       }
12303 +       smp_mb__before_clear_bit();
12304 +       cpu_clear(cpu, flush_cpumask);
12305 +       smp_mb__after_clear_bit();
12306 +out:
12307 +       put_cpu_no_resched();
12308 +
12309 +       return IRQ_HANDLED;
12310 +}
12311 +
12312 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
12313 +                                               unsigned long va)
12314 +{
12315 +       /*
12316 +        * A couple of (to be removed) sanity checks:
12317 +        *
12318 +        * - current CPU must not be in mask
12319 +        * - mask must exist :)
12320 +        */
12321 +       BUG_ON(cpus_empty(cpumask));
12322 +       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
12323 +       BUG_ON(!mm);
12324 +
12325 +       /* If a CPU which we ran on has gone down, OK. */
12326 +       cpus_and(cpumask, cpumask, cpu_online_map);
12327 +       if (cpus_empty(cpumask))
12328 +               return;
12329 +
12330 +       /*
12331 +        * I'm not happy about this global shared spinlock in the
12332 +        * MM hot path, but we'll see how contended it is.
12333 +        * Temporarily this turns IRQs off, so that lockups are
12334 +        * detected by the NMI watchdog.
12335 +        */
12336 +       spin_lock(&tlbstate_lock);
12337 +       
12338 +       flush_mm = mm;
12339 +       flush_va = va;
12340 +#if NR_CPUS <= BITS_PER_LONG
12341 +       atomic_set_mask(cpumask, &flush_cpumask);
12342 +#else
12343 +       {
12344 +               int k;
12345 +               unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
12346 +               unsigned long *cpu_mask = (unsigned long *)&cpumask;
12347 +               for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
12348 +                       atomic_set_mask(cpu_mask[k], &flush_mask[k]);
12349 +       }
12350 +#endif
12351 +       /*
12352 +        * We have to send the IPI only to
12353 +        * CPUs affected.
12354 +        */
12355 +       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
12356 +
12357 +       while (!cpus_empty(flush_cpumask))
12358 +               /* nothing. lockup detection does not belong here */
12359 +               mb();
12360 +
12361 +       flush_mm = NULL;
12362 +       flush_va = 0;
12363 +       spin_unlock(&tlbstate_lock);
12364 +}
12365 +       
12366 +void flush_tlb_current_task(void)
12367 +{
12368 +       struct mm_struct *mm = current->mm;
12369 +       cpumask_t cpu_mask;
12370 +
12371 +       preempt_disable();
12372 +       cpu_mask = mm->cpu_vm_mask;
12373 +       cpu_clear(smp_processor_id(), cpu_mask);
12374 +
12375 +       local_flush_tlb();
12376 +       if (!cpus_empty(cpu_mask))
12377 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
12378 +       preempt_enable();
12379 +}
12380 +
12381 +void flush_tlb_mm (struct mm_struct * mm)
12382 +{
12383 +       cpumask_t cpu_mask;
12384 +
12385 +       preempt_disable();
12386 +       cpu_mask = mm->cpu_vm_mask;
12387 +       cpu_clear(smp_processor_id(), cpu_mask);
12388 +
12389 +       if (current->active_mm == mm) {
12390 +               if (current->mm)
12391 +                       local_flush_tlb();
12392 +               else
12393 +                       leave_mm(smp_processor_id());
12394 +       }
12395 +       if (!cpus_empty(cpu_mask))
12396 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
12397 +
12398 +       preempt_enable();
12399 +}
12400 +
12401 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
12402 +{
12403 +       struct mm_struct *mm = vma->vm_mm;
12404 +       cpumask_t cpu_mask;
12405 +
12406 +       preempt_disable();
12407 +       cpu_mask = mm->cpu_vm_mask;
12408 +       cpu_clear(smp_processor_id(), cpu_mask);
12409 +
12410 +       if (current->active_mm == mm) {
12411 +               if(current->mm)
12412 +                       __flush_tlb_one(va);
12413 +               else
12414 +                       leave_mm(smp_processor_id());
12415 +       }
12416 +
12417 +       if (!cpus_empty(cpu_mask))
12418 +               flush_tlb_others(cpu_mask, mm, va);
12419 +
12420 +       preempt_enable();
12421 +}
12422 +EXPORT_SYMBOL(flush_tlb_page);
12423 +
12424 +static void do_flush_tlb_all(void* info)
12425 +{
12426 +       unsigned long cpu = smp_processor_id();
12427 +
12428 +       __flush_tlb_all();
12429 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
12430 +               leave_mm(cpu);
12431 +}
12432 +
12433 +void flush_tlb_all(void)
12434 +{
12435 +       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
12436 +}
12437 +
12438 +#else
12439 +
12440 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
12441 +                                    struct pt_regs *regs)
12442 +{ return 0; }
12443 +void flush_tlb_current_task(void)
12444 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
12445 +void flush_tlb_mm(struct mm_struct * mm)
12446 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
12447 +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
12448 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
12449 +void flush_tlb_all(void)
12450 +{ xen_tlb_flush_all(); }
12451 +
12452 +#endif /* XEN */
12453 +
12454 +/*
12455 + * this function sends a 'reschedule' IPI to another CPU.
12456 + * it goes straight through and wastes no time serializing
12457 + * anything. Worst case is that we lose a reschedule ...
12458 + */
12459 +void smp_send_reschedule(int cpu)
12460 +{
12461 +       WARN_ON(cpu_is_offline(cpu));
12462 +       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
12463 +}
12464 +
12465 +/*
12466 + * Structure and data for smp_call_function(). This is designed to minimise
12467 + * static memory requirements. It also looks cleaner.
12468 + */
12469 +static DEFINE_SPINLOCK(call_lock);
12470 +
12471 +struct call_data_struct {
12472 +       void (*func) (void *info);
12473 +       void *info;
12474 +       atomic_t started;
12475 +       atomic_t finished;
12476 +       int wait;
12477 +};
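+
+/*
+ * Protocol: the caller fills in a call_data_struct on its own stack,
+ * points call_data at it while holding call_lock, and raises
+ * CALL_FUNCTION_VECTOR; each target CPU bumps 'started' before running
+ * func and 'finished' afterwards, so the caller can spin until every
+ * other CPU has caught up.
+ */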
12478 +
12479 +void lock_ipi_call_lock(void)
12480 +{
12481 +       spin_lock_irq(&call_lock);
12482 +}
12483 +
12484 +void unlock_ipi_call_lock(void)
12485 +{
12486 +       spin_unlock_irq(&call_lock);
12487 +}
12488 +
12489 +static struct call_data_struct * call_data;
12490 +
12491 +/*
12492 + * this function sends a 'generic call function' IPI to all other CPUs
12493 + * in the system.
12494 + */
12495 +
12496 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
12497 +                       int wait)
12498 +/*
12499 + * [SUMMARY] Run a function on all other CPUs.
12500 + * <func> The function to run. This must be fast and non-blocking.
12501 + * <info> An arbitrary pointer to pass to the function.
12502 + * <nonatomic> currently unused.
12503 + * <wait> If true, wait (atomically) until function has completed on other CPUs.
12504 + * [RETURNS] 0 on success, else a negative status code. Does not return until
12505 + * remote CPUs are nearly ready to execute <<func>>, or have already executed it.
12506 + *
12507 + * You must not call this function with disabled interrupts or from a
12508 + * hardware interrupt handler or from a bottom half handler.
12509 + */
12510 +{
12511 +       struct call_data_struct data;
12512 +       int cpus;
12513 +
12514 +       /* Holding any lock stops cpus from going down. */
12515 +       spin_lock(&call_lock);
12516 +       cpus = num_online_cpus() - 1;
12517 +       if (!cpus) {
12518 +               spin_unlock(&call_lock);
12519 +               return 0;
12520 +       }
12521 +
12522 +       /* Can deadlock when called with interrupts disabled */
12523 +       WARN_ON(irqs_disabled());
12524 +
12525 +       data.func = func;
12526 +       data.info = info;
12527 +       atomic_set(&data.started, 0);
12528 +       data.wait = wait;
12529 +       if (wait)
12530 +               atomic_set(&data.finished, 0);
12531 +
12532 +       call_data = &data;
12533 +       mb();
12534 +       
12535 +       /* Send a message to all other CPUs and wait for them to respond */
12536 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
12537 +
12538 +       /* Wait for response */
12539 +       while (atomic_read(&data.started) != cpus)
12540 +               barrier();
12541 +
12542 +       if (wait)
12543 +               while (atomic_read(&data.finished) != cpus)
12544 +                       barrier();
12545 +       spin_unlock(&call_lock);
12546 +
12547 +       return 0;
12548 +}
12549 +EXPORT_SYMBOL(smp_call_function);
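+
+/*
+ * Minimal usage sketch (hypothetical helper): run a fast, non-blocking
+ * function on every other CPU and wait for all of them to finish.
+ *
+ *     static void dump_state(void *info) { ... }
+ *     smp_call_function(dump_state, NULL, 0, 1);
+ */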
12550 +
12551 +static void stop_this_cpu (void * dummy)
12552 +{
12553 +       /*
12554 +        * Remove this CPU:
12555 +        */
12556 +       cpu_clear(smp_processor_id(), cpu_online_map);
12557 +       local_irq_disable();
12558 +#if 0
12559 +       disable_local_APIC();
12560 +#endif
12561 +       if (cpu_data[smp_processor_id()].hlt_works_ok)
12562 +               for (;;) halt();
12563 +       for (;;);
12564 +}
12565 +
12566 +/*
12567 + * this function calls the 'stop' function on all other CPUs in the system.
12568 + */
12569 +
12570 +void smp_send_stop(void)
12571 +{
12572 +       smp_call_function(stop_this_cpu, NULL, 1, 0);
12573 +
12574 +       local_irq_disable();
12575 +#if 0
12576 +       disable_local_APIC();
12577 +#endif
12578 +       local_irq_enable();
12579 +}
12580 +
12581 +/*
12582 + * Reschedule call back. Nothing to do,
12583 + * all the work is done automatically when
12584 + * we return from the interrupt.
12585 + */
12586 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
12587 +                                    struct pt_regs *regs)
12588 +{
12589 +
12590 +       return IRQ_HANDLED;
12591 +}
12592 +
12593 +#include <linux/kallsyms.h>
12594 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
12595 +                                       struct pt_regs *regs)
12596 +{
12597 +       void (*func) (void *info) = call_data->func;
12598 +       void *info = call_data->info;
12599 +       int wait = call_data->wait;
12600 +
12601 +       /*
12602 +        * Notify initiating CPU that I've grabbed the data and am
12603 +        * about to execute the function
12604 +        */
12605 +       mb();
12606 +       atomic_inc(&call_data->started);
12607 +       /*
12608 +        * At this point the info structure may be out of scope unless wait==1
12609 +        */
12610 +       irq_enter();
12611 +       (*func)(info);
12612 +       irq_exit();
12613 +
12614 +       if (wait) {
12615 +               mb();
12616 +               atomic_inc(&call_data->finished);
12617 +       }
12618 +
12619 +       return IRQ_HANDLED;
12620 +}
12621 +
12622 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/swiotlb.c tmp-linux-2.6-xen.patch/arch/i386/kernel/swiotlb.c
12623 --- ref-linux-2.6.16.9/arch/i386/kernel/swiotlb.c       1970-01-01 01:00:00.000000000 +0100
12624 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/swiotlb.c  2006-04-10 00:05:52.000000000 +0200
12625 @@ -0,0 +1,674 @@
12626 +/*
12627 + * Dynamic DMA mapping support.
12628 + *
12629 + * This implementation is a fallback for platforms that do not support
12630 + * I/O TLBs (aka DMA address translation hardware).
12631 + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
12632 + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
12633 + * Copyright (C) 2000, 2003 Hewlett-Packard Co
12634 + *     David Mosberger-Tang <davidm@hpl.hp.com>
12635 + * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
12636 + */
12637 +
12638 +#include <linux/cache.h>
12639 +#include <linux/mm.h>
12640 +#include <linux/module.h>
12641 +#include <linux/pci.h>
12642 +#include <linux/spinlock.h>
12643 +#include <linux/string.h>
12644 +#include <linux/types.h>
12645 +#include <linux/ctype.h>
12646 +#include <linux/init.h>
12647 +#include <linux/bootmem.h>
12648 +#include <linux/highmem.h>
12649 +#include <asm/io.h>
12650 +#include <asm/pci.h>
12651 +#include <asm/dma.h>
12652 +#include <asm/uaccess.h>
12653 +#include <xen/interface/memory.h>
12654 +
12655 +int swiotlb;
12656 +EXPORT_SYMBOL(swiotlb);
12657 +
12658 +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
12659 +
12660 +#define SG_ENT_PHYS_ADDRESS(sg)        (page_to_phys((sg)->page) + (sg)->offset)
12661 +
12662 +/*
12663 + * Maximum allowable number of contiguous slabs to map,
12664 + * must be a power of 2.  What is the appropriate value?
12665 + * The complexity of {map,unmap}_single is linearly dependent on this value.
12666 + */
12667 +#define IO_TLB_SEGSIZE 128
12668 +
12669 +/*
12670 + * log of the size of each IO TLB slab.  The number of slabs is command line
12671 + * controllable.
12672 + */
12673 +#define IO_TLB_SHIFT 11
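+/*
+ * 1 << IO_TLB_SHIFT = 2KB per slab, so an IO_TLB_SEGSIZE segment is
+ * 256KB, the largest single bounce mapping this allocator will satisfy.
+ */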
12674 +
12675 +static int swiotlb_force;
12676 +static char *iotlb_virt_start;
12677 +static unsigned long iotlb_nslabs;
12678 +
12679 +/*
12680 + * Used to do a quick range check in swiotlb_unmap_single and
12681 + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
12682 + * API.
12683 + */
12684 +static dma_addr_t iotlb_bus_start, iotlb_bus_end, iotlb_bus_mask;
12685 +
12686 +/* Does the given dma address reside within the swiotlb aperture? */
12687 +#define in_swiotlb_aperture(a) (!(((a) ^ iotlb_bus_start) & iotlb_bus_mask))
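+/*
+ * The single XOR-and-mask test works because the aperture is a
+ * power-of-two number of bytes, assumed naturally aligned in bus space.
+ */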
12688 +
12689 +/*
12690 + * When the IOMMU overflows we return a fallback buffer. This sets the size.
12691 + */
12692 +static unsigned long io_tlb_overflow = 32*1024;
12693 +
12694 +void *io_tlb_overflow_buffer;
12695 +
12696 +/*
12697 + * Free list: io_tlb_list[i] is the number of contiguous free slots
12698 + * starting at index i (0 means the slot is in use).
12699 + */
12700 +static unsigned int *io_tlb_list;
12701 +static unsigned int io_tlb_index;
12702 +
12703 +/*
12704 + * We need to save away the original address corresponding to a mapped entry
12705 + * for the sync operations.
12706 + */
12707 +static struct phys_addr {
12708 +       struct page *page;
12709 +       unsigned int offset;
12710 +} *io_tlb_orig_addr;
12711 +
12712 +/*
12713 + * Protect the above data structures in the map and unmap calls
12714 + */
12715 +static DEFINE_SPINLOCK(io_tlb_lock);
12716 +
12717 +static int __init
12718 +setup_io_tlb_npages(char *str)
12719 +{
12720 +       /* Unlike ia64, the argument is the aperture size in megabytes, not 'slabs'! */
12721 +       if (isdigit(*str)) {
12722 +               iotlb_nslabs = simple_strtoul(str, &str, 0) <<
12723 +                       (20 - IO_TLB_SHIFT);
12724 +               iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
12725 +               /* Round up to power of two (xen_create_contiguous_region). */
12726 +               while (iotlb_nslabs & (iotlb_nslabs-1))
12727 +                       iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
12728 +       }
12729 +       if (*str == ',')
12730 +               ++str;
12731 +       /*
12732 +         * NB. 'force' enables the swiotlb, but doesn't force its use for
12733 +         * every DMA like it does on native Linux. 'off' forcibly disables
12734 +         * use of the swiotlb.
12735 +         */
12736 +       if (!strcmp(str, "force"))
12737 +               swiotlb_force = 1;
12738 +       else if (!strcmp(str, "off"))
12739 +               swiotlb_force = -1;
12740 +       return 1;
12741 +}
12742 +__setup("swiotlb=", setup_io_tlb_npages);
12743 +/* make io_tlb_overflow tunable too? */
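+
+/*
+ * E.g. booting with "swiotlb=64,force" requests a 64MB aperture and
+ * enables the swiotlb even in an unprivileged domain.
+ */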
12744 +
12745 +/*
12746 + * Statically reserve bounce buffer space and initialize bounce buffer data
12747 + * structures for the software IO TLB used to implement the PCI DMA API.
12748 + */
12749 +void
12750 +swiotlb_init_with_default_size (size_t default_size)
12751 +{
12752 +       unsigned long i, bytes;
12753 +       int rc;
12754 +
12755 +       if (!iotlb_nslabs) {
12756 +               iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
12757 +               iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
12758 +               /* Round up to power of two (xen_create_contiguous_region). */
12759 +               while (iotlb_nslabs & (iotlb_nslabs-1))
12760 +                       iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
12761 +       }
12762 +
12763 +       bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
12764 +
12765 +       /*
12766 +        * Get IO TLB memory from the low pages
12767 +        */
12768 +       iotlb_virt_start = alloc_bootmem_low_pages(bytes);
12769 +       if (!iotlb_virt_start)
12770 +               panic("Cannot allocate SWIOTLB buffer!\n"
12771 +                     "Use dom0_mem Xen boot parameter to reserve\n"
12772 +                     "some DMA memory (e.g., dom0_mem=-128M).\n");
12773 +
12774 +       /* Hardcode 31 address bits for now: aacraid limitation. */
12775 +       rc = xen_create_contiguous_region(
12776 +               (unsigned long)iotlb_virt_start, get_order(bytes), 31);
12777 +       BUG_ON(rc);
12778 +
12779 +       /*
12780 +        * Allocate and initialize the free list array.  This array is used
12781 +        * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
12782 +        */
12783 +       io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
12784 +       for (i = 0; i < iotlb_nslabs; i++)
12785 +               io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
12786 +       io_tlb_index = 0;
12787 +       io_tlb_orig_addr = alloc_bootmem(
12788 +               iotlb_nslabs * sizeof(*io_tlb_orig_addr));
12789 +
12790 +       /*
12791 +        * Get the overflow emergency buffer
12792 +        */
12793 +       io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
12794 +
12795 +       iotlb_bus_start = virt_to_bus(iotlb_virt_start);
12796 +       iotlb_bus_end   = iotlb_bus_start + bytes;
12797 +       iotlb_bus_mask  = ~(dma_addr_t)(bytes - 1);
12798 +
12799 +       printk(KERN_INFO "Software IO TLB enabled: \n"
12800 +              " Aperture:     %lu megabytes\n"
12801 +              " Bus range:    0x%016lx - 0x%016lx\n"
12802 +              " Kernel range: 0x%016lx - 0x%016lx\n",
12803 +              bytes >> 20,
12804 +              (unsigned long)iotlb_bus_start,
12805 +              (unsigned long)iotlb_bus_end,
12806 +              (unsigned long)iotlb_virt_start,
12807 +              (unsigned long)iotlb_virt_start + bytes);
12808 +}
12809 +
12810 +void
12811 +swiotlb_init(void)
12812 +{
12813 +       long ram_end;
12814 +       size_t defsz = 64 * (1 << 20); /* 64MB default size */
12815 +
12816 +       if (swiotlb_force == 1) {
12817 +               swiotlb = 1;
12818 +       } else if ((swiotlb_force != -1) &&
12819 +                  (xen_start_info->flags & SIF_INITDOMAIN)) {
12820 +               /* Domain 0 always has a swiotlb. */
12821 +               ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
12822 +               if (ram_end <= 0x7ffff)
12823 +                       defsz = 2 * (1 << 20); /* 2MB on systems with <2GB RAM. */
12824 +               swiotlb = 1;
12825 +       }
12826 +
12827 +       if (swiotlb)
12828 +               swiotlb_init_with_default_size(defsz);
12829 +       else
12830 +               printk(KERN_INFO "Software IO TLB disabled\n");
12831 +}
12832 +
12833 +/*
12834 + * We use __copy_to_user to transfer to the host buffer because the buffer
12835 + * may be mapped read-only (e.g., in the blkback driver) but lower-level
12836 + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
12837 + * unnecessary copy from the aperture to the host buffer, and a page fault.
12838 + */
12839 +static void
12840 +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
12841 +{
12842 +       if (PageHighMem(buffer.page)) {
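+               /*
+                * Highmem pages have no permanent kernel mapping: copy a
+                * page at a time through a temporary kmap_atomic() window.
+                */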
12843 +               size_t len, bytes;
12844 +               char *dev, *host, *kmp;
12845 +               len = size;
12846 +               while (len != 0) {
12847 +                       if (((bytes = len) + buffer.offset) > PAGE_SIZE)
12848 +                               bytes = PAGE_SIZE - buffer.offset;
12849 +                       kmp  = kmap_atomic(buffer.page, KM_SWIOTLB);
12850 +                       dev  = dma_addr + size - len;
12851 +                       host = kmp + buffer.offset;
12852 +                       if (dir == DMA_FROM_DEVICE) {
12853 +                               if (__copy_to_user(host, dev, bytes))
12854 +                                       /* inaccessible */;
12855 +                       } else
12856 +                               memcpy(dev, host, bytes);
12857 +                       kunmap_atomic(kmp, KM_SWIOTLB);
12858 +                       len -= bytes;
12859 +                       buffer.page++;
12860 +                       buffer.offset = 0;
12861 +               }
12862 +       } else {
12863 +               char *host = (char *)phys_to_virt(
12864 +                       page_to_pseudophys(buffer.page)) + buffer.offset;
12865 +               if (dir == DMA_FROM_DEVICE) {
12866 +                       if (__copy_to_user(host, dma_addr, size))
12867 +                               /* inaccessible */;
12868 +               } else if (dir == DMA_TO_DEVICE)
12869 +                       memcpy(dma_addr, host, size);
12870 +       }
12871 +}
12872 +
12873 +/*
12874 + * Allocates bounce buffer and returns its kernel virtual address.
12875 + */
12876 +static void *
12877 +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
12878 +{
12879 +       unsigned long flags;
12880 +       char *dma_addr;
12881 +       unsigned int nslots, stride, index, wrap;
12882 +       int i;
12883 +
12884 +       /*
12885 +        * For mappings greater than a page, we limit the stride (and
12886 +        * hence alignment) to a page size.
12887 +        */
12888 +       nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
12889 +       if (size > PAGE_SIZE)
12890 +               stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
12891 +       else
12892 +               stride = 1;
12893 +
12894 +       BUG_ON(!nslots);
12895 +
12896 +       /*
12897 +        * Find suitable number of IO TLB entries size that will fit this
12898 +        * request and allocate a buffer from that IO TLB pool.
12899 +        */
12900 +       spin_lock_irqsave(&io_tlb_lock, flags);
12901 +       {
12902 +               wrap = index = ALIGN(io_tlb_index, stride);
12903 +
12904 +               if (index >= iotlb_nslabs)
12905 +                       wrap = index = 0;
12906 +
12907 +               do {
12908 +                       /*
12909 +                        * If we find a slot that indicates we have 'nslots'
12910 +                        * number of contiguous buffers, we allocate the
12911 +                        * buffers from that slot and mark the entries as '0'
12912 +                        * indicating unavailable.
12913 +                        */
12914 +                       if (io_tlb_list[index] >= nslots) {
12915 +                               int count = 0;
12916 +
12917 +                               for (i = index; i < (int)(index + nslots); i++)
12918 +                                       io_tlb_list[i] = 0;
12919 +                               for (i = index - 1;
12920 +                                    (OFFSET(i, IO_TLB_SEGSIZE) !=
12921 +                                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
12922 +                                    i--)
12923 +                                       io_tlb_list[i] = ++count;
12924 +                               dma_addr = iotlb_virt_start +
12925 +                                       (index << IO_TLB_SHIFT);
12926 +
12927 +                               /*
12928 +                                * Update the indices to avoid searching in
12929 +                                * the next round.
12930 +                                */
12931 +                               io_tlb_index = 
12932 +                                       ((index + nslots) < iotlb_nslabs
12933 +                                        ? (index + nslots) : 0);
12934 +
12935 +                               goto found;
12936 +                       }
12937 +                       index += stride;
12938 +                       if (index >= iotlb_nslabs)
12939 +                               index = 0;
12940 +               } while (index != wrap);
12941 +
12942 +               spin_unlock_irqrestore(&io_tlb_lock, flags);
12943 +               return NULL;
12944 +       }
12945 +  found:
12946 +       spin_unlock_irqrestore(&io_tlb_lock, flags);
12947 +
12948 +       /*
12949 +        * Save away the mapping from the original address to the DMA address.
12950 +        * This is needed when we sync the memory.  Then we sync the buffer if
12951 +        * needed.
12952 +        */
12953 +       io_tlb_orig_addr[index] = buffer;
12954 +       if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
12955 +               __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
12956 +
12957 +       return dma_addr;
12958 +}
12959 +
12960 +/*
12961 + * dma_addr is the kernel virtual address of the bounce buffer to unmap.
12962 + */
12963 +static void
12964 +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
12965 +{
12966 +       unsigned long flags;
12967 +       int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
12968 +       int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
12969 +       struct phys_addr buffer = io_tlb_orig_addr[index];
12970 +
12971 +       /*
12972 +        * First, sync the memory before unmapping the entry
12973 +        */
12974 +       if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
12975 +               __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
12976 +
12977 +       /*
12978 +        * Return the buffer to the free list by setting the corresponding
12979 +        * entries to indicate the number of contiguous entries available.
12980 +        * While returning the entries to the free list, we merge the entries
12981 +        * with slots below and above the pool being returned.
12982 +        */
12983 +       spin_lock_irqsave(&io_tlb_lock, flags);
12984 +       {
12985 +               count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
12986 +                        io_tlb_list[index + nslots] : 0);
12987 +               /*
12988 +                * Step 1: return the slots to the free list, merging them
12989 +                * with the succeeding slots
12990 +                */
12991 +               for (i = index + nslots - 1; i >= index; i--)
12992 +                       io_tlb_list[i] = ++count;
12993 +               /*
12994 +                * Step 2: merge the returned slots with the preceding slots,
12995 +                * if available (non zero)
12996 +                */
12997 +               for (i = index - 1;
12998 +                    (OFFSET(i, IO_TLB_SEGSIZE) !=
12999 +                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
13000 +                    i--)
13001 +                       io_tlb_list[i] = ++count;
13002 +       }
13003 +       spin_unlock_irqrestore(&io_tlb_lock, flags);
13004 +}
13005 +
13006 +static void
13007 +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13008 +{
13009 +       int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13010 +       struct phys_addr buffer = io_tlb_orig_addr[index];
13011 +       BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
13012 +       __sync_single(buffer, dma_addr, size, dir);
13013 +}
13014 +
13015 +static void
13016 +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
13017 +{
13018 +       /*
13019 +        * Ran out of IOMMU space for this operation. This is very bad.
13020 +        * Unfortunately, drivers cannot handle this operation properly
13021 +        * unless they check for pci_dma_mapping_error() (most don't).
13022 +        * When the mapping is small enough return a static buffer to limit
13023 +        * the damage, or panic when the transfer is too big.
13024 +        */
13025 +       printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
13026 +              "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
13027 +
13028 +       if (size > io_tlb_overflow && do_panic) {
13029 +               if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13030 +                       panic("PCI-DMA: Memory would be corrupted\n");
13031 +               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13032 +                       panic("PCI-DMA: Random memory would be DMAed\n");
13033 +       }
13034 +}
13035 +
13036 +/*
13037 + * Map a single buffer of the indicated size for DMA in streaming mode.  The
13038 + * PCI address to use is returned.
13039 + *
13040 + * Once the device is given the dma address, the device owns this memory until
13041 + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
13042 + */
13043 +dma_addr_t
13044 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
13045 +{
13046 +       dma_addr_t dev_addr = virt_to_bus(ptr);
13047 +       void *map;
13048 +       struct phys_addr buffer;
13049 +
13050 +       BUG_ON(dir == DMA_NONE);
13051 +
13052 +       /*
13053 +        * If the pointer passed in happens to be in the device's DMA window,
13054 +        * we can safely return the device addr and not worry about bounce
13055 +        * buffering it.
13056 +        */
13057 +       if (!range_straddles_page_boundary(ptr, size) &&
13058 +           !address_needs_mapping(hwdev, dev_addr))
13059 +               return dev_addr;
13060 +
13061 +       /*
13062 +        * Oh well, have to allocate and map a bounce buffer.
13063 +        */
13064 +       buffer.page   = virt_to_page(ptr);
13065 +       buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
13066 +       map = map_single(hwdev, buffer, size, dir);
13067 +       if (!map) {
13068 +               swiotlb_full(hwdev, size, dir, 1);
13069 +               map = io_tlb_overflow_buffer;
13070 +       }
13071 +
13072 +       dev_addr = virt_to_bus(map);
13073 +       return dev_addr;
13074 +}
13075 +
13076 +/*
13077 + * Unmap a single streaming mode DMA translation.  The dma_addr and size must
13078 + * match what was provided for in a previous swiotlb_map_single call.  All
13079 + * other usages are undefined.
13080 + *
13081 + * After this call, reads by the cpu to the buffer are guaranteed to see
13082 + * whatever the device wrote there.
13083 + */
13084 +void
13085 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
13086 +                    int dir)
13087 +{
13088 +       BUG_ON(dir == DMA_NONE);
13089 +       if (in_swiotlb_aperture(dev_addr))
13090 +               unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
13091 +}
13092 +
13093 +/*
13094 + * Make physical memory consistent for a single streaming mode DMA translation
13095 + * after a transfer.
13096 + *
13097 + * If you perform a swiotlb_map_single() but wish to interrogate the buffer
13098 + * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
13099 + * call this function before doing so.  The next time you give the PCI dma
13100 + * address back to the card, you must first perform a
13101 + * swiotlb_dma_sync_for_device, and then the device again owns the buffer.
13102 + */
13103 +void
13104 +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
13105 +                           size_t size, int dir)
13106 +{
13107 +       BUG_ON(dir == DMA_NONE);
13108 +       if (in_swiotlb_aperture(dev_addr))
13109 +               sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13110 +}
13111 +
13112 +void
13113 +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
13114 +                              size_t size, int dir)
13115 +{
13116 +       BUG_ON(dir == DMA_NONE);
13117 +       if (in_swiotlb_aperture(dev_addr))
13118 +               sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13119 +}
13120 +
13121 +/*
13122 + * Map a set of buffers described by scatterlist in streaming mode for DMA.
13123 + * This is the scatter-gather version of the above swiotlb_map_single
13124 + * interface.  Here the scatter gather list elements are each tagged with the
13125 + * appropriate dma address and length.  They are obtained via
13126 + * sg_dma_{address,length}(SG).
13127 + *
13128 + * NOTE: An implementation may be able to use a smaller number of
13129 + *       DMA address/length pairs than there are SG table elements.
13130 + *       (for example via virtual mapping capabilities)
13131 + *       The routine returns the number of addr/length pairs actually
13132 + *       used, at most nents.
13133 + *
13134 + * Device ownership issues as mentioned above for swiotlb_map_single are the
13135 + * same here.
13136 + */
13137 +int
13138 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
13139 +              int dir)
13140 +{
13141 +       struct phys_addr buffer;
13142 +       dma_addr_t dev_addr;
13143 +       char *map;
13144 +       int i;
13145 +
13146 +       BUG_ON(dir == DMA_NONE);
13147 +
13148 +       for (i = 0; i < nelems; i++, sg++) {
13149 +               dev_addr = SG_ENT_PHYS_ADDRESS(sg);
13150 +               if (address_needs_mapping(hwdev, dev_addr)) {
13151 +                       buffer.page   = sg->page;
13152 +                       buffer.offset = sg->offset;
13153 +                       map = map_single(hwdev, buffer, sg->length, dir);
13154 +                       if (!map) {
13155 +                               /* Don't panic here, we expect map_sg users
13156 +                                  to do proper error handling. */
13157 +                               swiotlb_full(hwdev, sg->length, dir, 0);
13158 +                               swiotlb_unmap_sg(hwdev, sg - i, i, dir);
13159 +                               sg[0].dma_length = 0;
13160 +                               return 0;
13161 +                       }
13162 +                       sg->dma_address = (dma_addr_t)virt_to_bus(map);
13163 +               } else
13164 +                       sg->dma_address = dev_addr;
13165 +               sg->dma_length = sg->length;
13166 +       }
13167 +       return nelems;
13168 +}
13169 +
13170 +/*
13171 + * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
13172 + * concerning calls here are the same as for swiotlb_unmap_single() above.
13173 + */
13174 +void
13175 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
13176 +                int dir)
13177 +{
13178 +       int i;
13179 +
13180 +       BUG_ON(dir == DMA_NONE);
13181 +
13182 +       for (i = 0; i < nelems; i++, sg++)
13183 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13184 +                       unmap_single(hwdev, 
13185 +                                    (void *)bus_to_virt(sg->dma_address),
13186 +                                    sg->dma_length, dir);
13187 +}
13188 +
13189 +/*
13190 + * Make physical memory consistent for a set of streaming mode DMA translations
13191 + * after a transfer.
13192 + *
13193 + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
13194 + * and usage.
13195 + */
13196 +void
13197 +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
13198 +                       int nelems, int dir)
13199 +{
13200 +       int i;
13201 +
13202 +       BUG_ON(dir == DMA_NONE);
13203 +
13204 +       for (i = 0; i < nelems; i++, sg++)
13205 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13206 +                       sync_single(hwdev,
13207 +                                   (void *)bus_to_virt(sg->dma_address),
13208 +                                   sg->dma_length, dir);
13209 +}
13210 +
13211 +void
13212 +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
13213 +                          int nelems, int dir)
13214 +{
13215 +       int i;
13216 +
13217 +       BUG_ON(dir == DMA_NONE);
13218 +
13219 +       for (i = 0; i < nelems; i++, sg++)
13220 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13221 +                       sync_single(hwdev,
13222 +                                   (void *)bus_to_virt(sg->dma_address),
13223 +                                   sg->dma_length, dir);
13224 +}
13225 +
13226 +dma_addr_t
13227 +swiotlb_map_page(struct device *hwdev, struct page *page,
13228 +                unsigned long offset, size_t size,
13229 +                enum dma_data_direction direction)
13230 +{
13231 +       struct phys_addr buffer;
13232 +       dma_addr_t dev_addr;
13233 +       char *map;
13234 +
13235 +       dev_addr = page_to_phys(page) + offset;
13236 +       if (address_needs_mapping(hwdev, dev_addr)) {
13237 +               buffer.page   = page;
13238 +               buffer.offset = offset;
13239 +               map = map_single(hwdev, buffer, size, direction);
13240 +               if (!map) {
13241 +                       swiotlb_full(hwdev, size, direction, 1);
13242 +                       map = io_tlb_overflow_buffer;
13243 +               }
13244 +               dev_addr = (dma_addr_t)virt_to_bus(map);
13245 +       }
13246 +
13247 +       return dev_addr;
13248 +}
13249 +
13250 +void
13251 +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
13252 +                  size_t size, enum dma_data_direction direction)
13253 +{
13254 +       BUG_ON(direction == DMA_NONE);
13255 +       if (in_swiotlb_aperture(dma_address))
13256 +               unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
13257 +}
13258 +
13259 +int
13260 +swiotlb_dma_mapping_error(dma_addr_t dma_addr)
13261 +{
13262 +       return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
13263 +}
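
swiotlb_map_page() above silently falls back to io_tlb_overflow_buffer when
the aperture is exhausted, and this predicate is how callers detect that
sentinel address. A hedged sketch of the single-buffer pattern ("mydev",
"buf", and "len" are hypothetical):

        dma_addr_t handle = swiotlb_map_single(mydev, buf, len, DMA_TO_DEVICE);

        if (swiotlb_dma_mapping_error(handle))
                return -ENOMEM;         /* mapping hit the overflow buffer */
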
13264 +
13265 +/*
13266 + * Return whether the given PCI device DMA address mask can be supported
13267 + * properly.  For example, if your device can only drive the low 24-bits
13268 + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
13269 + * this function.
13270 + */
13271 +int
13272 +swiotlb_dma_supported (struct device *hwdev, u64 mask)
13273 +{
13274 +       return (mask >= (iotlb_bus_end - 1));
13275 +}
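
A hypothetical probe fragment showing where this check lands, assuming this
configuration routes the generic dma_supported hook to
swiotlb_dma_supported(): a 24-bit-limited device as in the comment above
would do

        if (dma_set_mask(mydev, 0x00ffffffULL))
                return -EIO;            /* mask rejected by the DMA layer */
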
13276 +
13277 +EXPORT_SYMBOL(swiotlb_init);
13278 +EXPORT_SYMBOL(swiotlb_map_single);
13279 +EXPORT_SYMBOL(swiotlb_unmap_single);
13280 +EXPORT_SYMBOL(swiotlb_map_sg);
13281 +EXPORT_SYMBOL(swiotlb_unmap_sg);
13282 +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
13283 +EXPORT_SYMBOL(swiotlb_sync_single_for_device);
13284 +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
13285 +EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
13286 +EXPORT_SYMBOL(swiotlb_map_page);
13287 +EXPORT_SYMBOL(swiotlb_unmap_page);
13288 +EXPORT_SYMBOL(swiotlb_dma_mapping_error);
13289 +EXPORT_SYMBOL(swiotlb_dma_supported);
13290 +
13291 +/*
13292 + * Local variables:
13293 + *  c-file-style: "linux"
13294 + *  indent-tabs-mode: t
13295 + *  c-indent-level: 8
13296 + *  c-basic-offset: 8
13297 + *  tab-width: 8
13298 + * End:
13299 + */
13300 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/sysenter.c tmp-linux-2.6-xen.patch/arch/i386/kernel/sysenter.c
13301 --- ref-linux-2.6.16.9/arch/i386/kernel/sysenter.c      2006-04-19 08:10:14.000000000 +0200
13302 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/sysenter.c 2006-04-10 00:05:52.000000000 +0200
13303 @@ -13,6 +13,7 @@
13304  #include <linux/gfp.h>
13305  #include <linux/string.h>
13306  #include <linux/elf.h>
13307 +#include <linux/mm.h>
13308  
13309  #include <asm/cpufeature.h>
13310  #include <asm/msr.h>
13311 @@ -23,6 +24,7 @@ extern asmlinkage void sysenter_entry(vo
13312  
13313  void enable_sep_cpu(void)
13314  {
13315 +#ifdef CONFIG_X86_SYSENTER
13316         int cpu = get_cpu();
13317         struct tss_struct *tss = &per_cpu(init_tss, cpu);
13318  
13319 @@ -37,6 +39,7 @@ void enable_sep_cpu(void)
13320         wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
13321         wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
13322         put_cpu();      
13323 +#endif
13324  }
13325  
13326  /*
13327 @@ -45,23 +48,90 @@ void enable_sep_cpu(void)
13328   */
13329  extern const char vsyscall_int80_start, vsyscall_int80_end;
13330  extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
13331 +static void *syscall_page;
13332  
13333  int __init sysenter_setup(void)
13334  {
13335 -       void *page = (void *)get_zeroed_page(GFP_ATOMIC);
13336 +       syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
13337  
13338 -       __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
13339 -
13340 -       if (!boot_cpu_has(X86_FEATURE_SEP)) {
13341 -               memcpy(page,
13342 -                      &vsyscall_int80_start,
13343 -                      &vsyscall_int80_end - &vsyscall_int80_start);
13344 +#ifdef CONFIG_X86_SYSENTER
13345 +       if (boot_cpu_has(X86_FEATURE_SEP)) {
13346 +               memcpy(syscall_page,
13347 +                      &vsyscall_sysenter_start,
13348 +                      &vsyscall_sysenter_end - &vsyscall_sysenter_start);
13349                 return 0;
13350         }
13351 +#endif
13352  
13353 -       memcpy(page,
13354 -              &vsyscall_sysenter_start,
13355 -              &vsyscall_sysenter_end - &vsyscall_sysenter_start);
13356 +       memcpy(syscall_page,
13357 +              &vsyscall_int80_start,
13358 +              &vsyscall_int80_end - &vsyscall_int80_start);
13359  
13360         return 0;
13361  }
13362 +
13363 +static struct page*
13364 +syscall_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
13365 +{
13366 +       struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
13367 +       get_page(p);
13368 +       return p;
13369 +}
13370 +
13371 +/* Prevent VMA merging */
13372 +static void syscall_vma_close(struct vm_area_struct *vma)
13373 +{
13374 +}
13375 +
13376 +static struct vm_operations_struct syscall_vm_ops = {
13377 +       .close = syscall_vma_close,
13378 +       .nopage = syscall_nopage,
13379 +};
13380 +
13381 +/* Setup a VMA at program startup for the vsyscall page */
13382 +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
13383 +{
13384 +       struct vm_area_struct *vma;
13385 +       struct mm_struct *mm = current->mm;
13386 +       int ret;
13387 +
13388 +       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
13389 +       if (!vma)
13390 +               return -ENOMEM;
13391 +
13392 +       memset(vma, 0, sizeof(struct vm_area_struct));
13393 +       /* Could randomize here */
13394 +       vma->vm_start = VSYSCALL_BASE;
13395 +       vma->vm_end = VSYSCALL_BASE + PAGE_SIZE;
13396 +       /* MAYWRITE to allow gdb to COW and set breakpoints */
13397 +       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
13398 +       vma->vm_flags |= mm->def_flags;
13399 +       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
13400 +       vma->vm_ops = &syscall_vm_ops;
13401 +       vma->vm_mm = mm;
13402 +
13403 +       down_write(&mm->mmap_sem);
13404 +       if ((ret = insert_vm_struct(mm, vma))) {
13405 +               up_write(&mm->mmap_sem);
13406 +               kmem_cache_free(vm_area_cachep, vma);
13407 +               return ret;
13408 +       }
13409 +       mm->total_vm++;
13410 +       up_write(&mm->mmap_sem);
13411 +       return 0;
13412 +}
13413 +
13414 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
13415 +{
13416 +       return NULL;
13417 +}
13418 +
13419 +int in_gate_area(struct task_struct *task, unsigned long addr)
13420 +{
13421 +       return 0;
13422 +}
13423 +
13424 +int in_gate_area_no_task(unsigned long addr)
13425 +{
13426 +       return 0;
13427 +}
13428 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/time-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/time-xen.c
13429 --- ref-linux-2.6.16.9/arch/i386/kernel/time-xen.c      1970-01-01 01:00:00.000000000 +0100
13430 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/time-xen.c 2006-04-10 00:05:52.000000000 +0200
13431 @@ -0,0 +1,1097 @@
13432 +/*
13433 + *  linux/arch/i386/kernel/time.c
13434 + *
13435 + *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
13436 + *
13437 + * This file contains the PC-specific time handling details:
13438 + * reading the RTC at bootup, etc..
13439 + * 1994-07-02    Alan Modra
13440 + *     fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
13441 + * 1995-03-26    Markus Kuhn
13442 + *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
13443 + *      precision CMOS clock update
13444 + * 1996-05-03    Ingo Molnar
13445 + *      fixed time warps in do_[slow|fast]_gettimeoffset()
13446 + * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
13447 + *             "A Kernel Model for Precision Timekeeping" by Dave Mills
13448 + * 1998-09-05    (Various)
13449 + *     More robust do_fast_gettimeoffset() algorithm implemented
13450 + *     (works with APM, Cyrix 6x86MX and Centaur C6),
13451 + *     monotonic gettimeofday() with fast_get_timeoffset(),
13452 + *     drift-proof precision TSC calibration on boot
13453 + *     (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
13454 + *     Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
13455 + *     ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
13456 + * 1998-12-16    Andrea Arcangeli
13457 + *     Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
13458 + *     because was not accounting lost_ticks.
13459 + * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
13460 + *     Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13461 + *     serialize accesses to xtime/lost_ticks).
13462 + */
13463 +
13464 +#include <linux/errno.h>
13465 +#include <linux/sched.h>
13466 +#include <linux/kernel.h>
13467 +#include <linux/param.h>
13468 +#include <linux/string.h>
13469 +#include <linux/mm.h>
13470 +#include <linux/interrupt.h>
13471 +#include <linux/time.h>
13472 +#include <linux/delay.h>
13473 +#include <linux/init.h>
13474 +#include <linux/smp.h>
13475 +#include <linux/module.h>
13476 +#include <linux/sysdev.h>
13477 +#include <linux/bcd.h>
13478 +#include <linux/efi.h>
13479 +#include <linux/mca.h>
13480 +#include <linux/sysctl.h>
13481 +#include <linux/percpu.h>
13482 +#include <linux/kernel_stat.h>
13483 +#include <linux/posix-timers.h>
13484 +
13485 +#include <asm/io.h>
13486 +#include <asm/smp.h>
13487 +#include <asm/irq.h>
13488 +#include <asm/msr.h>
13489 +#include <asm/delay.h>
13490 +#include <asm/mpspec.h>
13491 +#include <asm/uaccess.h>
13492 +#include <asm/processor.h>
13493 +#include <asm/timer.h>
13494 +#include <asm/sections.h>
13495 +
13496 +#include "mach_time.h"
13497 +
13498 +#include <linux/timex.h>
13499 +#include <linux/config.h>
13500 +
13501 +#include <asm/hpet.h>
13502 +
13503 +#include <asm/arch_hooks.h>
13504 +
13505 +#include <xen/evtchn.h>
13506 +#include <xen/interface/vcpu.h>
13507 +
13508 +#if defined (__i386__)
13509 +#include <asm/i8259.h>
13510 +#endif
13511 +
13512 +int pit_latch_buggy;              /* extern */
13513 +
13514 +#if defined(__x86_64__)
13515 +unsigned long vxtime_hz = PIT_TICK_RATE;
13516 +struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
13517 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
13518 +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
13519 +struct timespec __xtime __section_xtime;
13520 +struct timezone __sys_tz __section_sys_tz;
13521 +#endif
13522 +
13523 +unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
13524 +EXPORT_SYMBOL(cpu_khz);
13525 +
13526 +extern unsigned long wall_jiffies;
13527 +
13528 +DEFINE_SPINLOCK(rtc_lock);
13529 +EXPORT_SYMBOL(rtc_lock);
13530 +
13531 +#if defined (__i386__)
13532 +#include <asm/i8253.h>
13533 +#endif
13534 +
13535 +DEFINE_SPINLOCK(i8253_lock);
13536 +EXPORT_SYMBOL(i8253_lock);
13537 +
13538 +extern struct init_timer_opts timer_tsc_init;
13539 +extern struct timer_opts timer_tsc;
13540 +#define timer_none timer_tsc
13541 +struct timer_opts *cur_timer __read_mostly = &timer_tsc;
13542 +
13543 +/* These are periodically updated in shared_info, and then copied here. */
13544 +struct shadow_time_info {
13545 +       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
13546 +       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
13547 +       u32 tsc_to_nsec_mul;
13548 +       u32 tsc_to_usec_mul;
13549 +       int tsc_shift;
13550 +       u32 version;
13551 +};
13552 +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
13553 +static struct timespec shadow_tv;
13554 +static u32 shadow_tv_version;
13555 +
13556 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
13557 +static u64 processed_system_time;   /* System time (ns) at last processing. */
13558 +static DEFINE_PER_CPU(u64, processed_system_time);
13559 +
13560 +/* How much CPU time was spent blocked and how much was 'stolen'? */
13561 +static DEFINE_PER_CPU(u64, processed_stolen_time);
13562 +static DEFINE_PER_CPU(u64, processed_blocked_time);
13563 +
13564 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
13565 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
13566 +
13567 +/* Must be signed, as it's compared with s64 quantities which can be -ve. */
13568 +#define NS_PER_TICK (1000000000LL/HZ)
13569 +
13570 +static inline void __normalize_time(time_t *sec, s64 *nsec)
13571 +{
13572 +       while (*nsec >= NSEC_PER_SEC) {
13573 +               (*nsec) -= NSEC_PER_SEC;
13574 +               (*sec)++;
13575 +       }
13576 +       while (*nsec < 0) {
13577 +               (*nsec) += NSEC_PER_SEC;
13578 +               (*sec)--;
13579 +       }
13580 +}
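
For example, __normalize_time() turns (sec = 5, nsec = -200000000) into
(sec = 4, nsec = 800000000): each loop iteration moves one whole second
between the two fields until nsec lands in [0, NSEC_PER_SEC).
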
13581 +
13582 +/* Does this guest OS track Xen time, or set its wall clock independently? */
13583 +static int independent_wallclock = 0;
13584 +static int __init __independent_wallclock(char *str)
13585 +{
13586 +       independent_wallclock = 1;
13587 +       return 1;
13588 +}
13589 +__setup("independent_wallclock", __independent_wallclock);
13590 +
13591 +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
13592 +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
13593 +static int __init __permitted_clock_jitter(char *str)
13594 +{
13595 +       permitted_clock_jitter = simple_strtoul(str, NULL, 0);
13596 +       return 1;
13597 +}
13598 +__setup("permitted_clock_jitter=", __permitted_clock_jitter);
13599 +
13600 +int tsc_disable __devinitdata = 0;
13601 +
13602 +static void delay_tsc(unsigned long loops)
13603 +{
13604 +       unsigned long bclock, now;
13605 +
13606 +       rdtscl(bclock);
13607 +       do {
13608 +               rep_nop();
13609 +               rdtscl(now);
13610 +       } while ((now - bclock) < loops);
13611 +}
13612 +
13613 +struct timer_opts timer_tsc = {
13614 +       .name = "tsc",
13615 +       .delay = delay_tsc,
13616 +};
13617 +
13618 +/*
13619 + * Scale a 64-bit delta by shifting it, then multiplying by a 32-bit
13620 + * fraction, yielding a 64-bit result.
13621 + */
13622 +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
13623 +{
13624 +       u64 product;
13625 +#ifdef __i386__
13626 +       u32 tmp1, tmp2;
13627 +#endif
13628 +
13629 +       if (shift < 0)
13630 +               delta >>= -shift;
13631 +       else
13632 +               delta <<= shift;
13633 +
13634 +#ifdef __i386__
13635 +       __asm__ (
13636 +               "mul  %5       ; "
13637 +               "mov  %4,%%eax ; "
13638 +               "mov  %%edx,%4 ; "
13639 +               "mul  %5       ; "
13640 +               "xor  %5,%5    ; "
13641 +               "add  %4,%%eax ; "
13642 +               "adc  %5,%%edx ; "
13643 +               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
13644 +               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
13645 +#else
13646 +       __asm__ (
13647 +               "mul %%rdx ; shrd $32,%%rdx,%%rax"
13648 +               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
13649 +#endif
13650 +
13651 +       return product;
13652 +}
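
In other words, the assembly computes ns = ((delta << shift) * mul_frac) >> 32,
treating mul_frac as a 0.32 fixed-point multiplier. A portable reference
version of the same arithmetic, added here purely for illustration (it is
not part of the original patch):

        static inline u64 scale_delta_ref(u64 delta, u32 mul_frac, int shift)
        {
                u32 lo, hi;

                if (shift < 0)
                        delta >>= -shift;
                else
                        delta <<= shift;

                lo = (u32)delta;
                hi = (u32)(delta >> 32);

                /* (hi*2^32 + lo) * frac >> 32  ==  hi*frac + (lo*frac >> 32) */
                return (u64)hi * mul_frac + (((u64)lo * mul_frac) >> 32);
        }
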
13653 +
13654 +#if defined (__i386__)
13655 +int read_current_timer(unsigned long *timer_val)
13656 +{
13657 +       rdtscl(*timer_val);
13658 +       return 0;
13659 +}
13660 +#endif
13661 +
13662 +void init_cpu_khz(void)
13663 +{
13664 +       u64 __cpu_khz = 1000000ULL << 32;
13665 +       struct vcpu_time_info *info;
13666 +       info = &HYPERVISOR_shared_info->vcpu_info[0].time;
13667 +       do_div(__cpu_khz, info->tsc_to_system_mul);
13668 +       if (info->tsc_shift < 0)
13669 +               cpu_khz = __cpu_khz << -info->tsc_shift;
13670 +       else
13671 +               cpu_khz = __cpu_khz >> info->tsc_shift;
13672 +}
13673 +
13674 +static u64 get_nsec_offset(struct shadow_time_info *shadow)
13675 +{
13676 +       u64 now, delta;
13677 +       rdtscll(now);
13678 +       delta = now - shadow->tsc_timestamp;
13679 +       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
13680 +}
13681 +
13682 +static unsigned long get_usec_offset(struct shadow_time_info *shadow)
13683 +{
13684 +       u64 now, delta;
13685 +       rdtscll(now);
13686 +       delta = now - shadow->tsc_timestamp;
13687 +       return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
13688 +}
13689 +
13690 +static void __update_wallclock(time_t sec, long nsec)
13691 +{
13692 +       long wtm_nsec, xtime_nsec;
13693 +       time_t wtm_sec, xtime_sec;
13694 +       u64 tmp, wc_nsec;
13695 +
13696 +       /* Adjust wall-clock time base based on wall_jiffies ticks. */
13697 +       wc_nsec = processed_system_time;
13698 +       wc_nsec += sec * (u64)NSEC_PER_SEC;
13699 +       wc_nsec += nsec;
13700 +       wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
13701 +
13702 +       /* Split wallclock base into seconds and nanoseconds. */
13703 +       tmp = wc_nsec;
13704 +       xtime_nsec = do_div(tmp, 1000000000);
13705 +       xtime_sec  = (time_t)tmp;
13706 +
13707 +       wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
13708 +       wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
13709 +
13710 +       set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
13711 +       set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
13712 +
13713 +       ntp_clear();
13714 +}
13715 +
13716 +static void update_wallclock(void)
13717 +{
13718 +       shared_info_t *s = HYPERVISOR_shared_info;
13719 +
13720 +       do {
13721 +               shadow_tv_version = s->wc_version;
13722 +               rmb();
13723 +               shadow_tv.tv_sec  = s->wc_sec;
13724 +               shadow_tv.tv_nsec = s->wc_nsec;
13725 +               rmb();
13726 +       } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
13727 +
13728 +       if (!independent_wallclock)
13729 +               __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
13730 +}
13731 +
13732 +/*
13733 + * Reads a consistent set of time-base values from Xen, into a shadow data
13734 + * area.
13735 + */
13736 +static void get_time_values_from_xen(void)
13737 +{
13738 +       shared_info_t           *s = HYPERVISOR_shared_info;
13739 +       struct vcpu_time_info   *src;
13740 +       struct shadow_time_info *dst;
13741 +
13742 +       src = &s->vcpu_info[smp_processor_id()].time;
13743 +       dst = &per_cpu(shadow_time, smp_processor_id());
13744 +
13745 +       do {
13746 +               dst->version = src->version;
13747 +               rmb();
13748 +               dst->tsc_timestamp     = src->tsc_timestamp;
13749 +               dst->system_timestamp  = src->system_time;
13750 +               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
13751 +               dst->tsc_shift         = src->tsc_shift;
13752 +               rmb();
13753 +       } while ((src->version & 1) | (dst->version ^ src->version));
13754 +
13755 +       dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
13756 +}
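
The retry condition encodes a lock-free snapshot protocol: the hypervisor
bumps 'version' to an odd value before touching the fields and to an even
value afterwards, so readers retry while (version & 1) is set or while the
version changed under them. Conceptually, the writer side looks like this
(illustrative only; the real writer is Xen itself, not kernel code):

        static void writer_update(struct vcpu_time_info *t,
                                  u64 tsc, u64 stime)
        {
                t->version++;           /* odd: update in progress   */
                wmb();
                t->tsc_timestamp = tsc;
                t->system_time   = stime;
                wmb();
                t->version++;           /* even: snapshot consistent */
        }
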
13757 +
13758 +static inline int time_values_up_to_date(int cpu)
13759 +{
13760 +       struct vcpu_time_info   *src;
13761 +       struct shadow_time_info *dst;
13762 +
13763 +       src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
13764 +       dst = &per_cpu(shadow_time, cpu);
13765 +
13766 +       rmb();
13767 +       return (dst->version == src->version);
13768 +}
13769 +
13770 +/*
13771 + * This is a special lock that is owned by the CPU and holds the index
13772 + * register we are working with.  It is required for NMI access to the
13773 + * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
13774 + */
13775 +volatile unsigned long cmos_lock = 0;
13776 +EXPORT_SYMBOL(cmos_lock);
13777 +
13778 +/* Routines for accessing the CMOS RAM/RTC. */
13779 +unsigned char rtc_cmos_read(unsigned char addr)
13780 +{
13781 +       unsigned char val;
13782 +       lock_cmos_prefix(addr);
13783 +       outb_p(addr, RTC_PORT(0));
13784 +       val = inb_p(RTC_PORT(1));
13785 +       lock_cmos_suffix(addr);
13786 +       return val;
13787 +}
13788 +EXPORT_SYMBOL(rtc_cmos_read);
13789 +
13790 +void rtc_cmos_write(unsigned char val, unsigned char addr)
13791 +{
13792 +       lock_cmos_prefix(addr);
13793 +       outb_p(addr, RTC_PORT(0));
13794 +       outb_p(val, RTC_PORT(1));
13795 +       lock_cmos_suffix(addr);
13796 +}
13797 +EXPORT_SYMBOL(rtc_cmos_write);
13798 +
13799 +/*
13800 + * This version of gettimeofday has microsecond resolution
13801 + * and better than microsecond precision on fast x86 machines with TSC.
13802 + */
13803 +void do_gettimeofday(struct timeval *tv)
13804 +{
13805 +       unsigned long seq;
13806 +       unsigned long usec, sec;
13807 +       unsigned long max_ntp_tick;
13808 +       s64 nsec;
13809 +       unsigned int cpu;
13810 +       struct shadow_time_info *shadow;
13811 +       u32 local_time_version;
13812 +
13813 +       cpu = get_cpu();
13814 +       shadow = &per_cpu(shadow_time, cpu);
13815 +
13816 +       do {
13817 +               unsigned long lost;
13818 +
13819 +               local_time_version = shadow->version;
13820 +               seq = read_seqbegin(&xtime_lock);
13821 +
13822 +               usec = get_usec_offset(shadow);
13823 +               lost = jiffies - wall_jiffies;
13824 +
13825 +               /*
13826 +                * If time_adjust is negative then NTP is slowing the clock
13827 +                * so make sure not to go into the next possible interval.
13828 +                * Better to lose some accuracy than have time go backwards.
13829 +                */
13830 +               if (unlikely(time_adjust < 0)) {
13831 +                       max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
13832 +                       usec = min(usec, max_ntp_tick);
13833 +
13834 +                       if (lost)
13835 +                               usec += lost * max_ntp_tick;
13836 +               }
13837 +               else if (unlikely(lost))
13838 +                       usec += lost * (USEC_PER_SEC / HZ);
13839 +
13840 +               sec = xtime.tv_sec;
13841 +               usec += (xtime.tv_nsec / NSEC_PER_USEC);
13842 +
13843 +               nsec = shadow->system_timestamp - processed_system_time;
13844 +               __normalize_time(&sec, &nsec);
13845 +               usec += (long)nsec / NSEC_PER_USEC;
13846 +
13847 +               if (unlikely(!time_values_up_to_date(cpu))) {
13848 +                       /*
13849 +                        * We may have blocked for a long time,
13850 +                        * rendering our calculations invalid
13851 +                        * (e.g. the time delta may have
13852 +                        * overflowed). Detect that and recalculate
13853 +                        * with fresh values.
13854 +                        */
13855 +                       get_time_values_from_xen();
13856 +                       continue;
13857 +               }
13858 +       } while (read_seqretry(&xtime_lock, seq) ||
13859 +                (local_time_version != shadow->version));
13860 +
13861 +       put_cpu();
13862 +
13863 +       while (usec >= USEC_PER_SEC) {
13864 +               usec -= USEC_PER_SEC;
13865 +               sec++;
13866 +       }
13867 +
13868 +       tv->tv_sec = sec;
13869 +       tv->tv_usec = usec;
13870 +}
13871 +
13872 +EXPORT_SYMBOL(do_gettimeofday);
13873 +
13874 +int do_settimeofday(struct timespec *tv)
13875 +{
13876 +       time_t sec;
13877 +       s64 nsec;
13878 +       unsigned int cpu;
13879 +       struct shadow_time_info *shadow;
13880 +       dom0_op_t op;
13881 +
13882 +       if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
13883 +               return -EINVAL;
13884 +
13885 +       cpu = get_cpu();
13886 +       shadow = &per_cpu(shadow_time, cpu);
13887 +
13888 +       write_seqlock_irq(&xtime_lock);
13889 +
13890 +       /*
13891 +        * Ensure we aren't blocked for so long that our time delta
13892 +        * overflows. If that were to happen our shadow time values would
13893 +        * be stale, so we retry with fresh ones.
13894 +        */
13895 +       for (;;) {
13896 +               nsec = tv->tv_nsec - get_nsec_offset(shadow);
13897 +               if (time_values_up_to_date(cpu))
13898 +                       break;
13899 +               get_time_values_from_xen();
13900 +       }
13901 +       sec = tv->tv_sec;
13902 +       __normalize_time(&sec, &nsec);
13903 +
13904 +       if ((xen_start_info->flags & SIF_INITDOMAIN) &&
13905 +           !independent_wallclock) {
13906 +               op.cmd = DOM0_SETTIME;
13907 +               op.u.settime.secs        = sec;
13908 +               op.u.settime.nsecs       = nsec;
13909 +               op.u.settime.system_time = shadow->system_timestamp;
13910 +               HYPERVISOR_dom0_op(&op);
13911 +               update_wallclock();
13912 +       } else if (independent_wallclock) {
13913 +               nsec -= shadow->system_timestamp;
13914 +               __normalize_time(&sec, &nsec);
13915 +               __update_wallclock(sec, nsec);
13916 +       }
13917 +
13918 +       write_sequnlock_irq(&xtime_lock);
13919 +
13920 +       put_cpu();
13921 +
13922 +       clock_was_set();
13923 +       return 0;
13924 +}
13925 +
13926 +EXPORT_SYMBOL(do_settimeofday);
13927 +
13928 +static void sync_xen_wallclock(unsigned long dummy);
13929 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
13930 +static void sync_xen_wallclock(unsigned long dummy)
13931 +{
13932 +       time_t sec;
13933 +       s64 nsec;
13934 +       dom0_op_t op;
13935 +
13936 +       if (!ntp_synced() || independent_wallclock ||
13937 +           !(xen_start_info->flags & SIF_INITDOMAIN))
13938 +               return;
13939 +
13940 +       write_seqlock_irq(&xtime_lock);
13941 +
13942 +       sec  = xtime.tv_sec;
13943 +       nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
13944 +       __normalize_time(&sec, &nsec);
13945 +
13946 +       op.cmd = DOM0_SETTIME;
13947 +       op.u.settime.secs        = sec;
13948 +       op.u.settime.nsecs       = nsec;
13949 +       op.u.settime.system_time = processed_system_time;
13950 +       HYPERVISOR_dom0_op(&op);
13951 +
13952 +       update_wallclock();
13953 +
13954 +       write_sequnlock_irq(&xtime_lock);
13955 +
13956 +       /* Once per minute. */
13957 +       mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
13958 +}
13959 +
13960 +static int set_rtc_mmss(unsigned long nowtime)
13961 +{
13962 +       int retval;
13963 +
13964 +       WARN_ON(irqs_disabled());
13965 +
13966 +       if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
13967 +               return 0;
13968 +
13969 +       /* may get called again with interrupts locally disabled */
13970 +       spin_lock_irq(&rtc_lock);
13971 +       if (efi_enabled)
13972 +               retval = efi_set_rtc_mmss(nowtime);
13973 +       else
13974 +               retval = mach_set_rtc_mmss(nowtime);
13975 +       spin_unlock_irq(&rtc_lock);
13976 +
13977 +       return retval;
13978 +}
13979 +
13980 +/* monotonic_clock(): returns # of nanoseconds passed since time_init()
13981 + *             Note: This function is required to return accurate
13982 + *             time even in the absence of multiple timer ticks.
13983 + */
13984 +unsigned long long monotonic_clock(void)
13985 +{
13986 +       int cpu = get_cpu();
13987 +       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
13988 +       u64 time;
13989 +       u32 local_time_version;
13990 +
13991 +       do {
13992 +               local_time_version = shadow->version;
13993 +               barrier();
13994 +               time = shadow->system_timestamp + get_nsec_offset(shadow);
13995 +               if (!time_values_up_to_date(cpu))
13996 +                       get_time_values_from_xen();
13997 +               barrier();
13998 +       } while (local_time_version != shadow->version);
13999 +
14000 +       put_cpu();
14001 +
14002 +       return time;
14003 +}
14004 +EXPORT_SYMBOL(monotonic_clock);
14005 +
14006 +unsigned long long sched_clock(void)
14007 +{
14008 +       return monotonic_clock();
14009 +}
14010 +
14011 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
14012 +unsigned long profile_pc(struct pt_regs *regs)
14013 +{
14014 +       unsigned long pc = instruction_pointer(regs);
14015 +
14016 +#ifdef __x86_64__
14017 +       /* Assume the lock function has either no stack frame or only a single word.
14018 +          This checks if the address on the stack looks like a kernel text address.
14019 +          There is a small window for false hits, but in that case the tick
14020 +          is just accounted to the spinlock function.
14021 +          It would be better to write these functions in assembler again
14022 +          and check exactly. */
14023 +       if (in_lock_functions(pc)) {
14024 +               char *v = *(char **)regs->rsp;
14025 +               if ((v >= _stext && v <= _etext) ||
14026 +                       (v >= _sinittext && v <= _einittext) ||
14027 +                       (v >= (char *)MODULES_VADDR  && v <= (char *)MODULES_END))
14028 +                       return (unsigned long)v;
14029 +               return ((unsigned long *)regs->rsp)[1];
14030 +       }
14031 +#else
14032 +       if (in_lock_functions(pc))
14033 +               return *(unsigned long *)(regs->ebp + 4);
14034 +#endif
14035 +
14036 +       return pc;
14037 +}
14038 +EXPORT_SYMBOL(profile_pc);
14039 +#endif
14040 +
14041 +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
14042 +{
14043 +       s64 delta, delta_cpu, stolen, blocked;
14044 +       u64 sched_time;
14045 +       int i, cpu = smp_processor_id();
14046 +       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14047 +       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14048 +
14049 +       write_seqlock(&xtime_lock);
14050 +
14051 +       do {
14052 +               get_time_values_from_xen();
14053 +
14054 +               /* Obtain a consistent snapshot of elapsed wallclock cycles. */
14055 +               delta = delta_cpu =
14056 +                       shadow->system_timestamp + get_nsec_offset(shadow);
14057 +               delta     -= processed_system_time;
14058 +               delta_cpu -= per_cpu(processed_system_time, cpu);
14059 +
14060 +               /*
14061 +                * Obtain a consistent snapshot of stolen/blocked cycles. We
14062 +                * can use state_entry_time to detect if we get preempted here.
14063 +                */
14064 +               do {
14065 +                       sched_time = runstate->state_entry_time;
14066 +                       barrier();
14067 +                       stolen = runstate->time[RUNSTATE_runnable] +
14068 +                               runstate->time[RUNSTATE_offline] -
14069 +                               per_cpu(processed_stolen_time, cpu);
14070 +                       blocked = runstate->time[RUNSTATE_blocked] -
14071 +                               per_cpu(processed_blocked_time, cpu);
14072 +                       barrier();
14073 +               } while (sched_time != runstate->state_entry_time);
14074 +       } while (!time_values_up_to_date(cpu));
14075 +
14076 +       if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
14077 +            unlikely(delta_cpu < -(s64)permitted_clock_jitter))
14078 +           && printk_ratelimit()) {
14079 +               printk("Timer ISR/%d: Time went backwards: "
14080 +                      "delta=%lld delta_cpu=%lld shadow=%lld "
14081 +                      "off=%lld processed=%lld cpu_processed=%lld\n",
14082 +                      cpu, delta, delta_cpu, shadow->system_timestamp,
14083 +                      (s64)get_nsec_offset(shadow),
14084 +                      processed_system_time,
14085 +                      per_cpu(processed_system_time, cpu));
14086 +               for (i = 0; i < num_online_cpus(); i++)
14087 +                       printk(" %d: %lld\n", i,
14088 +                              per_cpu(processed_system_time, i));
14089 +       }
14090 +
14091 +       /* System-wide jiffy work. */
14092 +       while (delta >= NS_PER_TICK) {
14093 +               delta -= NS_PER_TICK;
14094 +               processed_system_time += NS_PER_TICK;
14095 +               do_timer(regs);
14096 +       }
14097 +
14098 +       if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
14099 +               update_wallclock();
14100 +               clock_was_set();
14101 +       }
14102 +
14103 +       write_sequnlock(&xtime_lock);
14104 +
14105 +       /*
14106 +        * Account stolen ticks.
14107 +        * HACK: Passing NULL to account_steal_time()
14108 +        * ensures that the ticks are accounted as stolen.
14109 +        */
14110 +       if ((stolen > 0) && (delta_cpu > 0)) {
14111 +               delta_cpu -= stolen;
14112 +               if (unlikely(delta_cpu < 0))
14113 +                       stolen += delta_cpu; /* clamp local-time progress */
14114 +               do_div(stolen, NS_PER_TICK);
14115 +               per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
14116 +               per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
14117 +               account_steal_time(NULL, (cputime_t)stolen);
14118 +       }
14119 +
14120 +       /*
14121 +        * Account blocked ticks.
14122 +        * HACK: Passing idle_task to account_steal_time()
14123 +        * ensures that the ticks are accounted as idle/wait.
14124 +        */
14125 +       if ((blocked > 0) && (delta_cpu > 0)) {
14126 +               delta_cpu -= blocked;
14127 +               if (unlikely(delta_cpu < 0))
14128 +                       blocked += delta_cpu; /* clamp local-time progress */
14129 +               do_div(blocked, NS_PER_TICK);
14130 +               per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
14131 +               per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
14132 +               account_steal_time(idle_task(cpu), (cputime_t)blocked);
14133 +       }
14134 +
14135 +       /* Account user/system ticks. */
14136 +       if (delta_cpu > 0) {
14137 +               do_div(delta_cpu, NS_PER_TICK);
14138 +               per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
14139 +               if (user_mode(regs))
14140 +                       account_user_time(current, (cputime_t)delta_cpu);
14141 +               else
14142 +                       account_system_time(current, HARDIRQ_OFFSET,
14143 +                                           (cputime_t)delta_cpu);
14144 +       }
14145 +
14146 +       /* Local timer processing (see update_process_times()). */
14147 +       run_local_timers();
14148 +       if (rcu_pending(cpu))
14149 +               rcu_check_callbacks(cpu, user_mode(regs));
14150 +       scheduler_tick();
14151 +       run_posix_cpu_timers(current);
14152 +
14153 +       return IRQ_HANDLED;
14154 +}
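
A worked example of the clamping above, with illustrative numbers: if only
5 ticks of CPU-local time passed (delta_cpu = 5 * NS_PER_TICK) but the
runstate reports 7 ticks stolen, delta_cpu would go negative, so stolen is
clamped back down to 5 ticks; local time accounting never runs ahead of the
actually elapsed system time.
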
14155 +
14156 +static void init_missing_ticks_accounting(int cpu)
14157 +{
14158 +       struct vcpu_register_runstate_memory_area area;
14159 +       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14160 +
14161 +       memset(runstate, 0, sizeof(*runstate));
14162 +
14163 +       area.addr.v = runstate;
14164 +       HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
14165 +
14166 +       per_cpu(processed_blocked_time, cpu) =
14167 +               runstate->time[RUNSTATE_blocked];
14168 +       per_cpu(processed_stolen_time, cpu) =
14169 +               runstate->time[RUNSTATE_runnable] +
14170 +               runstate->time[RUNSTATE_offline];
14171 +}
14172 +
14173 +/* not static: needed by APM */
14174 +unsigned long get_cmos_time(void)
14175 +{
14176 +       unsigned long retval;
14177 +
14178 +       spin_lock(&rtc_lock);
14179 +
14180 +       if (efi_enabled)
14181 +               retval = efi_get_time();
14182 +       else
14183 +               retval = mach_get_cmos_time();
14184 +
14185 +       spin_unlock(&rtc_lock);
14186 +
14187 +       return retval;
14188 +}
14189 +EXPORT_SYMBOL(get_cmos_time);
14190 +
14191 +static void sync_cmos_clock(unsigned long dummy);
14192 +
14193 +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
14194 +
14195 +static void sync_cmos_clock(unsigned long dummy)
14196 +{
14197 +       struct timeval now, next;
14198 +       int fail = 1;
14199 +
14200 +       /*
14201 +        * If we have an externally synchronized Linux clock, then update
14202 +        * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
14203 +        * called as close as possible to 500 ms before the new second starts.
14204 +        * This code is run on a timer.  If the clock is set, that timer
14205 +        * may not expire at the correct time.  Thus, we adjust...
14206 +        */
14207 +       if (!ntp_synced())
14208 +               /*
14209 +                * Not synced, exit, do not restart a timer (if one is
14210 +                * running, let it run out).
14211 +                */
14212 +               return;
14213 +
14214 +       do_gettimeofday(&now);
14215 +       if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
14216 +           now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
14217 +               fail = set_rtc_mmss(now.tv_sec);
14218 +
14219 +       next.tv_usec = USEC_AFTER - now.tv_usec;
14220 +       if (next.tv_usec <= 0)
14221 +               next.tv_usec += USEC_PER_SEC;
14222 +
14223 +       if (!fail)
14224 +               next.tv_sec = 659;
14225 +       else
14226 +               next.tv_sec = 0;
14227 +
14228 +       if (next.tv_usec >= USEC_PER_SEC) {
14229 +               next.tv_sec++;
14230 +               next.tv_usec -= USEC_PER_SEC;
14231 +       }
14232 +       mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
14233 +}
14234 +
14235 +void notify_arch_cmos_timer(void)
14236 +{
14237 +       mod_timer(&sync_cmos_timer, jiffies + 1);
14238 +       mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
14239 +}
14240 +
14241 +static long clock_cmos_diff, sleep_start;
14242 +
14243 +static struct timer_opts *last_timer;
14244 +static int timer_suspend(struct sys_device *dev, pm_message_t state)
14245 +{
14246 +       /*
14247 +        * Estimate time zone so that set_time can update the clock
14248 +        */
14249 +       clock_cmos_diff = -get_cmos_time();
14250 +       clock_cmos_diff += get_seconds();
14251 +       sleep_start = get_cmos_time();
14252 +       last_timer = cur_timer;
14253 +       cur_timer = &timer_none;
14254 +       if (last_timer->suspend)
14255 +               last_timer->suspend(state);
14256 +       return 0;
14257 +}
14258 +
14259 +static int timer_resume(struct sys_device *dev)
14260 +{
14261 +       unsigned long flags;
14262 +       unsigned long sec;
14263 +       unsigned long sleep_length;
14264 +
14265 +#ifdef CONFIG_HPET_TIMER
14266 +       if (is_hpet_enabled())
14267 +               hpet_reenable();
14268 +#endif
14269 +       sec = get_cmos_time() + clock_cmos_diff;
14270 +       sleep_length = (get_cmos_time() - sleep_start) * HZ;
14271 +       write_seqlock_irqsave(&xtime_lock, flags);
14272 +       xtime.tv_sec = sec;
14273 +       xtime.tv_nsec = 0;
14274 +       jiffies_64 += sleep_length;
14275 +       wall_jiffies += sleep_length;
14276 +       write_sequnlock_irqrestore(&xtime_lock, flags);
14277 +       if (last_timer->resume)
14278 +               last_timer->resume();
14279 +       cur_timer = last_timer;
14280 +       last_timer = NULL;
14281 +       touch_softlockup_watchdog();
14282 +       return 0;
14283 +}
14284 +
14285 +static struct sysdev_class timer_sysclass = {
14286 +       .resume = timer_resume,
14287 +       .suspend = timer_suspend,
14288 +       set_kset_name("timer"),
14289 +};
14290 +
14291 +
14292 +/* XXX this driverfs stuff should probably go elsewhere later -john */
14293 +static struct sys_device device_timer = {
14294 +       .id     = 0,
14295 +       .cls    = &timer_sysclass,
14296 +};
14297 +
14298 +static int time_init_device(void)
14299 +{
14300 +       int error = sysdev_class_register(&timer_sysclass);
14301 +       if (!error)
14302 +               error = sysdev_register(&device_timer);
14303 +       return error;
14304 +}
14305 +
14306 +device_initcall(time_init_device);
14307 +
14308 +#ifdef CONFIG_HPET_TIMER
14309 +extern void (*late_time_init)(void);
14310 +/* Duplicate of time_init() below, with hpet_enable part added */
14311 +static void __init hpet_time_init(void)
14312 +{
14313 +       xtime.tv_sec = get_cmos_time();
14314 +       xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
14315 +       set_normalized_timespec(&wall_to_monotonic,
14316 +               -xtime.tv_sec, -xtime.tv_nsec);
14317 +
14318 +       if ((hpet_enable() >= 0) && hpet_use_timer) {
14319 +               printk("Using HPET for base-timer\n");
14320 +       }
14321 +
14322 +       cur_timer = select_timer();
14323 +       printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
14324 +
14325 +       time_init_hook();
14326 +}
14327 +#endif
14328 +
14329 +/* Dynamically-mapped IRQ. */
14330 +DEFINE_PER_CPU(int, timer_irq);
14331 +
14332 +extern void (*late_time_init)(void);
14333 +static void setup_cpu0_timer_irq(void)
14334 +{
14335 +       per_cpu(timer_irq, 0) =
14336 +               bind_virq_to_irqhandler(
14337 +                       VIRQ_TIMER,
14338 +                       0,
14339 +                       timer_interrupt,
14340 +                       SA_INTERRUPT,
14341 +                       "timer0",
14342 +                       NULL);
14343 +       BUG_ON(per_cpu(timer_irq, 0) < 0);
14344 +}
14345 +
14346 +void __init time_init(void)
14347 +{
14348 +#ifdef CONFIG_HPET_TIMER
14349 +       if (is_hpet_capable()) {
14350 +               /*
14351 +                * HPET initialization needs to do memory-mapped io. So, let
14352 +                * us do a late initialization after mem_init().
14353 +                */
14354 +               late_time_init = hpet_time_init;
14355 +               return;
14356 +       }
14357 +#endif
14358 +       get_time_values_from_xen();
14359 +
14360 +       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
14361 +       per_cpu(processed_system_time, 0) = processed_system_time;
14362 +       init_missing_ticks_accounting(0);
14363 +
14364 +       update_wallclock();
14365 +
14366 +       init_cpu_khz();
14367 +       printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
14368 +              cpu_khz / 1000, cpu_khz % 1000);
14369 +
14370 +#if defined(__x86_64__)
14371 +       vxtime.mode = VXTIME_TSC;
14372 +       vxtime.quot = (1000000L << 32) / vxtime_hz;
14373 +       vxtime.tsc_quot = (1000L << 32) / cpu_khz;
14374 +       sync_core();
14375 +       rdtscll(vxtime.last_tsc);
14376 +#endif
14377 +
14378 +       /* Cannot request_irq() until kmem is initialised. */
14379 +       late_time_init = setup_cpu0_timer_irq;
14380 +}
14381 +
14382 +/* Convert jiffies to system time. */
14383 +u64 jiffies_to_st(unsigned long j)
14384 +{
14385 +       unsigned long seq;
14386 +       long delta;
14387 +       u64 st;
14388 +
14389 +       do {
14390 +               seq = read_seqbegin(&xtime_lock);
14391 +               delta = j - jiffies;
14392 +               /* NB. The next check can trigger in some wrap-around cases,
14393 +                * but that's ok: we'll just end up with a shorter timeout. */
14394 +               if (delta < 1)
14395 +                       delta = 1;
14396 +               st = processed_system_time + (delta * (u64)NS_PER_TICK);
14397 +       } while (read_seqretry(&xtime_lock, seq));
14398 +
14399 +       return st;
14400 +}
14401 +EXPORT_SYMBOL(jiffies_to_st);
14402 +
14403 +/*
14404 + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
14405 + * These functions are based on implementations from arch/s390/kernel/time.c
14406 + */
14407 +void stop_hz_timer(void)
14408 +{
14409 +       unsigned int cpu = smp_processor_id();
14410 +       unsigned long j;
14411 +
14412 +       /* We must do this /before/ checking rcu_pending(). */
14413 +       cpu_set(cpu, nohz_cpu_mask);
14414 +       smp_mb();
14415 +
14416 +       /* Leave ourselves in 'tick mode' if rcu or softirq pending. */
14417 +       if (rcu_pending(cpu) || local_softirq_pending()) {
14418 +               cpu_clear(cpu, nohz_cpu_mask);
14419 +               j = jiffies + 1;
14420 +       } else {
14421 +               j = next_timer_interrupt();
14422 +       }
14423 +
14424 +       BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
14425 +}
14426 +
14427 +void start_hz_timer(void)
14428 +{
14429 +       cpu_clear(smp_processor_id(), nohz_cpu_mask);
14430 +}
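
A hedged sketch of the intended call sequence from the idle loop (the real
caller lives in the process-handling part of this patch; HYPERVISOR_block()
is the usual blocking hypercall wrapper of this era, named here on that
assumption):

        static void idle_sketch(void)
        {
                stop_hz_timer();        /* one-shot timer at next event  */
                HYPERVISOR_block();     /* sleep until timer or wakeup   */
                start_hz_timer();       /* resume periodic tick handling */
        }
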
14431 +
14432 +/* No locking required. We are the only CPU running, and interrupts are off. */
14433 +void time_resume(void)
14434 +{
14435 +       init_cpu_khz();
14436 +
14437 +       get_time_values_from_xen();
14438 +
14439 +       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
14440 +       per_cpu(processed_system_time, 0) = processed_system_time;
14441 +       init_missing_ticks_accounting(0);
14442 +
14443 +       update_wallclock();
14444 +}
14445 +
14446 +#ifdef CONFIG_SMP
14447 +static char timer_name[NR_CPUS][15];
14448 +
14449 +void local_setup_timer(unsigned int cpu)
14450 +{
14451 +       int seq;
14452 +
14453 +       BUG_ON(cpu == 0);
14454 +
14455 +       do {
14456 +               seq = read_seqbegin(&xtime_lock);
14457 +               /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
14458 +               per_cpu(processed_system_time, cpu) =
14459 +                       per_cpu(shadow_time, 0).system_timestamp;
14460 +               init_missing_ticks_accounting(cpu);
14461 +       } while (read_seqretry(&xtime_lock, seq));
14462 +
14463 +       sprintf(timer_name[cpu], "timer%d", cpu);
14464 +       per_cpu(timer_irq, cpu) =
14465 +               bind_virq_to_irqhandler(
14466 +                       VIRQ_TIMER,
14467 +                       cpu,
14468 +                       timer_interrupt,
14469 +                       SA_INTERRUPT,
14470 +                       timer_name[cpu],
14471 +                       NULL);
14472 +       BUG_ON(per_cpu(timer_irq, cpu) < 0);
14473 +}
14474 +
14475 +void local_teardown_timer(unsigned int cpu)
14476 +{
14477 +       BUG_ON(cpu == 0);
14478 +       unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
14479 +}
14480 +#endif
14481 +
14482 +/*
14483 + * /proc/sys/xen: This really belongs in another file. It can stay here
14484 + * for now, however.
14485 + */
14486 +static ctl_table xen_subtable[] = {
14487 +       {
14488 +               .ctl_name       = 1,
14489 +               .procname       = "independent_wallclock",
14490 +               .data           = &independent_wallclock,
14491 +               .maxlen         = sizeof(independent_wallclock),
14492 +               .mode           = 0644,
14493 +               .proc_handler   = proc_dointvec
14494 +       },
14495 +       {
14496 +               .ctl_name       = 2,
14497 +               .procname       = "permitted_clock_jitter",
14498 +               .data           = &permitted_clock_jitter,
14499 +               .maxlen         = sizeof(permitted_clock_jitter),
14500 +               .mode           = 0644,
14501 +               .proc_handler   = proc_doulongvec_minmax
14502 +       },
14503 +       { 0 }
14504 +};
14505 +static ctl_table xen_table[] = {
14506 +       {
14507 +               .ctl_name       = 123,
14508 +               .procname       = "xen",
14509 +               .mode           = 0555,
14510 +               .child          = xen_subtable},
14511 +       { 0 }
14512 +};
14513 +static int __init xen_sysctl_init(void)
14514 +{
14515 +       (void)register_sysctl_table(xen_table, 0);
14516 +       return 0;
14517 +}
14518 +__initcall(xen_sysctl_init);
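
Once registered, the two entries surface as
/proc/sys/xen/independent_wallclock and /proc/sys/xen/permitted_clock_jitter,
so the boot-time knobs above can also be flipped at runtime, e.g. with
"echo 1 > /proc/sys/xen/independent_wallclock".
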
14519 +
14520 +/*
14521 + * Local variables:
14522 + *  c-file-style: "linux"
14523 + *  indent-tabs-mode: t
14524 + *  c-indent-level: 8
14525 + *  c-basic-offset: 8
14526 + *  tab-width: 8
14527 + * End:
14528 + */
14529 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/traps.c tmp-linux-2.6-xen.patch/arch/i386/kernel/traps.c
14530 --- ref-linux-2.6.16.9/arch/i386/kernel/traps.c 2006-04-19 08:10:14.000000000 +0200
14531 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/traps.c    2006-04-10 00:05:52.000000000 +0200
14532 @@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
14533  
14534  static void io_check_error(unsigned char reason, struct pt_regs * regs)
14535  {
14536 -       unsigned long i;
14537 -
14538         printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
14539         show_registers(regs);
14540  
14541         /* Re-enable the IOCK line, wait for a few seconds */
14542 -       reason = (reason & 0xf) | 8;
14543 -       outb(reason, 0x61);
14544 -       i = 2000;
14545 -       while (--i) udelay(1000);
14546 -       reason &= ~8;
14547 -       outb(reason, 0x61);
14548 +       clear_io_check_error(reason);
14549  }
14550  
14551  static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
14552 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/traps-xen.c tmp-linux-2.6-xen.patch/arch/i386/kernel/traps-xen.c
14553 --- ref-linux-2.6.16.9/arch/i386/kernel/traps-xen.c     1970-01-01 01:00:00.000000000 +0100
14554 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/traps-xen.c        2006-04-10 00:05:52.000000000 +0200
14555 @@ -0,0 +1,1094 @@
14556 +/*
14557 + *  linux/arch/i386/traps.c
14558 + *
14559 + *  Copyright (C) 1991, 1992  Linus Torvalds
14560 + *
14561 + *  Pentium III FXSR, SSE support
14562 + *     Gareth Hughes <gareth@valinux.com>, May 2000
14563 + */
14564 +
14565 +/*
14566 + * 'Traps.c' handles hardware traps and faults after we have saved some
14567 + * state in 'asm.s'.
14568 + */
14569 +#include <linux/config.h>
14570 +#include <linux/sched.h>
14571 +#include <linux/kernel.h>
14572 +#include <linux/string.h>
14573 +#include <linux/errno.h>
14574 +#include <linux/timer.h>
14575 +#include <linux/mm.h>
14576 +#include <linux/init.h>
14577 +#include <linux/delay.h>
14578 +#include <linux/spinlock.h>
14579 +#include <linux/interrupt.h>
14580 +#include <linux/highmem.h>
14581 +#include <linux/kallsyms.h>
14582 +#include <linux/ptrace.h>
14583 +#include <linux/utsname.h>
14584 +#include <linux/kprobes.h>
14585 +#include <linux/kexec.h>
14586 +
14587 +#ifdef CONFIG_EISA
14588 +#include <linux/ioport.h>
14589 +#include <linux/eisa.h>
14590 +#endif
14591 +
14592 +#ifdef CONFIG_MCA
14593 +#include <linux/mca.h>
14594 +#endif
14595 +
14596 +#include <asm/processor.h>
14597 +#include <asm/system.h>
14598 +#include <asm/uaccess.h>
14599 +#include <asm/io.h>
14600 +#include <asm/atomic.h>
14601 +#include <asm/debugreg.h>
14602 +#include <asm/desc.h>
14603 +#include <asm/i387.h>
14604 +#include <asm/nmi.h>
14605 +
14606 +#include <asm/smp.h>
14607 +#include <asm/arch_hooks.h>
14608 +#include <asm/kdebug.h>
14609 +
14610 +#include <linux/module.h>
14611 +
14612 +#include "mach_traps.h"
14613 +
14614 +asmlinkage int system_call(void);
14615 +
14616 +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
14617 +               { 0, 0 }, { 0, 0 } };
14618 +
14619 +/* Do we ignore FPU interrupts? */
14620 +char ignore_fpu_irq = 0;
14621 +
14622 +#ifndef CONFIG_X86_NO_IDT
14623 +/*
14624 + * The IDT has to be page-aligned to simplify the Pentium
14625 + * F0 0F bug workaround. We have a special link segment
14626 + * for this.
14627 + */
14628 +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
14629 +#endif
14630 +
14631 +asmlinkage void divide_error(void);
14632 +asmlinkage void debug(void);
14633 +asmlinkage void nmi(void);
14634 +asmlinkage void int3(void);
14635 +asmlinkage void overflow(void);
14636 +asmlinkage void bounds(void);
14637 +asmlinkage void invalid_op(void);
14638 +asmlinkage void device_not_available(void);
14639 +asmlinkage void coprocessor_segment_overrun(void);
14640 +asmlinkage void invalid_TSS(void);
14641 +asmlinkage void segment_not_present(void);
14642 +asmlinkage void stack_segment(void);
14643 +asmlinkage void general_protection(void);
14644 +asmlinkage void page_fault(void);
14645 +asmlinkage void coprocessor_error(void);
14646 +asmlinkage void simd_coprocessor_error(void);
14647 +asmlinkage void alignment_check(void);
14648 +#ifndef CONFIG_XEN
14649 +asmlinkage void spurious_interrupt_bug(void);
14650 +#else
14651 +asmlinkage void fixup_4gb_segment(void);
14652 +#endif
14653 +asmlinkage void machine_check(void);
14654 +
14655 +static int kstack_depth_to_print = 24;
14656 +struct notifier_block *i386die_chain;
14657 +static DEFINE_SPINLOCK(die_notifier_lock);
14658 +
14659 +int register_die_notifier(struct notifier_block *nb)
14660 +{
14661 +       int err = 0;
14662 +       unsigned long flags;
14663 +       spin_lock_irqsave(&die_notifier_lock, flags);
14664 +       err = notifier_chain_register(&i386die_chain, nb);
14665 +       spin_unlock_irqrestore(&die_notifier_lock, flags);
14666 +       return err;
14667 +}
14668 +EXPORT_SYMBOL(register_die_notifier);
14669 +
14670 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
14671 +{
14672 +       return  p > (void *)tinfo &&
14673 +               p < (void *)tinfo + THREAD_SIZE - 3;
14674 +}
14675 +
14676 +static void print_addr_and_symbol(unsigned long addr, char *log_lvl)
14677 +{
14678 +       printk(log_lvl);
14679 +       printk(" [<%08lx>] ", addr);
14680 +       print_symbol("%s", addr);
14681 +       printk("\n");
14682 +}
14683 +
14684 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
14685 +                               unsigned long *stack, unsigned long ebp,
14686 +                               char *log_lvl)
14687 +{
14688 +       unsigned long addr;
14689 +
14690 +#ifdef CONFIG_FRAME_POINTER
14691 +       while (valid_stack_ptr(tinfo, (void *)ebp)) {
14692 +               addr = *(unsigned long *)(ebp + 4);
14693 +               print_addr_and_symbol(addr, log_lvl);
14694 +               ebp = *(unsigned long *)ebp;
14695 +       }
14696 +#else
14697 +       while (valid_stack_ptr(tinfo, stack)) {
14698 +               addr = *stack++;
14699 +               if (__kernel_text_address(addr))
14700 +                       print_addr_and_symbol(addr, log_lvl);
14701 +       }
14702 +#endif
14703 +       return ebp;
14704 +}
14705 +
14706 +static void show_trace_log_lvl(struct task_struct *task,
14707 +                              unsigned long *stack, char *log_lvl)
14708 +{
14709 +       unsigned long ebp;
14710 +
14711 +       if (!task)
14712 +               task = current;
14713 +
14714 +       if (task == current) {
14715 +               /* Grab ebp right from our regs */
14716 +               asm ("movl %%ebp, %0" : "=r" (ebp) : );
14717 +       } else {
14718 +               /* ebp is the last reg pushed by switch_to */
14719 +               ebp = *(unsigned long *) task->thread.esp;
14720 +       }
14721 +
14722 +       while (1) {
14723 +               struct thread_info *context;
14724 +               context = (struct thread_info *)
14725 +                       ((unsigned long)stack & (~(THREAD_SIZE - 1)));
14726 +               ebp = print_context_stack(context, stack, ebp, log_lvl);
14727 +               stack = (unsigned long*)context->previous_esp;
14728 +               if (!stack)
14729 +                       break;
14730 +               printk(log_lvl);
14731 +               printk(" =======================\n");
14732 +       }
14733 +}
14734 +
14735 +void show_trace(struct task_struct *task, unsigned long * stack)
14736 +{
14737 +       show_trace_log_lvl(task, stack, "");
14738 +}
14739 +
14740 +static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
14741 +                              char *log_lvl)
14742 +{
14743 +       unsigned long *stack;
14744 +       int i;
14745 +
14746 +       if (esp == NULL) {
14747 +               if (task)
14748 +                       esp = (unsigned long*)task->thread.esp;
14749 +               else
14750 +                       esp = (unsigned long *)&esp;
14751 +       }
14752 +
14753 +       stack = esp;
14754 +       printk(log_lvl);
14755 +       for(i = 0; i < kstack_depth_to_print; i++) {
14756 +               if (kstack_end(stack))
14757 +                       break;
14758 +               if (i && ((i % 8) == 0)) {
14759 +                       printk("\n");
14760 +                       printk(log_lvl);
14761 +                       printk("       ");
14762 +               }
14763 +               printk("%08lx ", *stack++);
14764 +       }
14765 +       printk("\n");
14766 +       printk(log_lvl);
14767 +       printk("Call Trace:\n");
14768 +       show_trace_log_lvl(task, esp, log_lvl);
14769 +}
14770 +
14771 +void show_stack(struct task_struct *task, unsigned long *esp)
14772 +{
14773 +       show_stack_log_lvl(task, esp, "");
14774 +}
14775 +
14776 +/*
14777 + * The architecture-independent dump_stack generator
14778 + */
14779 +void dump_stack(void)
14780 +{
14781 +       unsigned long stack;
14782 +
14783 +       show_trace(current, &stack);
14784 +}
14785 +
14786 +EXPORT_SYMBOL(dump_stack);
14787 +
14788 +void show_registers(struct pt_regs *regs)
14789 +{
14790 +       int i;
14791 +       int in_kernel = 1;
14792 +       unsigned long esp;
14793 +       unsigned short ss;
14794 +
14795 +       esp = (unsigned long) (&regs->esp);
14796 +       savesegment(ss, ss);
14797 +       if (user_mode(regs)) {
14798 +               in_kernel = 0;
14799 +               esp = regs->esp;
14800 +               ss = regs->xss & 0xffff;
14801 +       }
14802 +       print_modules();
14803 +       printk(KERN_EMERG "CPU:    %d\nEIP:    %04x:[<%08lx>]    %s VLI\n"
14804 +                       "EFLAGS: %08lx   (%s %.*s) \n",
14805 +               smp_processor_id(), 0xffff & regs->xcs, regs->eip,
14806 +               print_tainted(), regs->eflags, system_utsname.release,
14807 +               (int)strcspn(system_utsname.version, " "),
14808 +               system_utsname.version);
14809 +       print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
14810 +       printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
14811 +               regs->eax, regs->ebx, regs->ecx, regs->edx);
14812 +       printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
14813 +               regs->esi, regs->edi, regs->ebp, esp);
14814 +       printk(KERN_EMERG "ds: %04x   es: %04x   ss: %04x\n",
14815 +               regs->xds & 0xffff, regs->xes & 0xffff, ss);
14816 +       printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)",
14817 +               current->comm, current->pid, current_thread_info(), current);
14818 +       /*
14819 +        * When in-kernel, we also print out the stack and code at the
14820 +        * time of the fault..
14821 +        * time of the fault.
14822 +       if (in_kernel) {
14823 +               u8 __user *eip;
14824 +
14825 +               printk("\n" KERN_EMERG "Stack: ");
14826 +               show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG);
14827 +
14828 +               printk(KERN_EMERG "Code: ");
14829 +
14830 +               eip = (u8 __user *)regs->eip - 43;
14831 +               for (i = 0; i < 64; i++, eip++) {
14832 +                       unsigned char c;
14833 +
14834 +                       if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
14835 +                               printk(" Bad EIP value.");
14836 +                               break;
14837 +                       }
14838 +                       if (eip == (u8 __user *)regs->eip)
14839 +                               printk("<%02x> ", c);
14840 +                       else
14841 +                               printk("%02x ", c);
14842 +               }
14843 +       }
14844 +       printk("\n");
14845 +}
14846 +
14847 +static void handle_BUG(struct pt_regs *regs)
14848 +{
14849 +       unsigned short ud2;
14850 +       unsigned short line;
14851 +       char *file;
14852 +       char c;
14853 +       unsigned long eip;
14854 +
14855 +       eip = regs->eip;
14856 +
14857 +       if (eip < PAGE_OFFSET)
14858 +               goto no_bug;
14859 +       if (__get_user(ud2, (unsigned short __user *)eip))
14860 +               goto no_bug;
14861 +       if (ud2 != 0x0b0f)
14862 +               goto no_bug;
14863 +       if (__get_user(line, (unsigned short __user *)(eip + 2)))
14864 +               goto bug;
14865 +       if (__get_user(file, (char * __user *)(eip + 4)) ||
14866 +               (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
14867 +               file = "<bad filename>";
14868 +
14869 +       printk(KERN_EMERG "------------[ cut here ]------------\n");
14870 +       printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
14871 +
14872 +no_bug:
14873 +       return;
14874 +
14875 +       /* Here we know it was a BUG but the file and line are unavailable */
14876 +bug:
14877 +       printk(KERN_EMERG "Kernel BUG\n");
14878 +}
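/*
 * The byte layout decoded above is what the i386 BUG() macro emits
 * with CONFIG_DEBUG_BUGVERBOSE (a sketch of <asm-i386/bug.h>):
 *
 *      ud2                     0x0f 0x0b at the faulting eip
 *      .word __LINE__          at eip + 2
 *      .long __FILE__          at eip + 4 (pointer to the name string)
 *
 * If the .word at eip + 2 cannot be read, the code falls back to the
 * bare "Kernel BUG" message via the bug: label.
 */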
14879 +
14880 +/* This path is taken when something in the kernel
14881 + * has done something bad and is about to be terminated.
14882 + */
14883 +void die(const char * str, struct pt_regs * regs, long err)
14884 +{
14885 +       static struct {
14886 +               spinlock_t lock;
14887 +               u32 lock_owner;
14888 +               int lock_owner_depth;
14889 +       } die = {
14890 +               .lock =                 SPIN_LOCK_UNLOCKED,
14891 +               .lock_owner =           -1,
14892 +               .lock_owner_depth =     0
14893 +       };
14894 +       static int die_counter;
14895 +       unsigned long flags;
14896 +
14897 +       if (die.lock_owner != raw_smp_processor_id()) {
14898 +               console_verbose();
14899 +               spin_lock_irqsave(&die.lock, flags);
14900 +               die.lock_owner = smp_processor_id();
14901 +               die.lock_owner_depth = 0;
14902 +               bust_spinlocks(1);
14903 +       }
14904 +       else
14905 +               local_save_flags(flags);
14906 +
14907 +       if (++die.lock_owner_depth < 3) {
14908 +               int nl = 0;
14909 +               handle_BUG(regs);
14910 +               printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
14911 +#ifdef CONFIG_PREEMPT
14912 +               printk(KERN_EMERG "PREEMPT ");
14913 +               nl = 1;
14914 +#endif
14915 +#ifdef CONFIG_SMP
14916 +               if (!nl)
14917 +                       printk(KERN_EMERG);
14918 +               printk("SMP ");
14919 +               nl = 1;
14920 +#endif
14921 +#ifdef CONFIG_DEBUG_PAGEALLOC
14922 +               if (!nl)
14923 +                       printk(KERN_EMERG);
14924 +               printk("DEBUG_PAGEALLOC");
14925 +               nl = 1;
14926 +#endif
14927 +               if (nl)
14928 +                       printk("\n");
14929 +               notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
14930 +               show_registers(regs);
14931 +       } else
14932 +               printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
14933 +
14934 +       bust_spinlocks(0);
14935 +       die.lock_owner = -1;
14936 +       spin_unlock_irqrestore(&die.lock, flags);
14937 +
14938 +       if (kexec_should_crash(current))
14939 +               crash_kexec(regs);
14940 +
14941 +       if (in_interrupt())
14942 +               panic("Fatal exception in interrupt");
14943 +
14944 +       if (panic_on_oops) {
14945 +               printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
14946 +               ssleep(5);
14947 +               panic("Fatal exception");
14948 +       }
14949 +       do_exit(SIGSEGV);
14950 +}
14951 +
14952 +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
14953 +{
14954 +       if (!user_mode_vm(regs))
14955 +               die(str, regs, err);
14956 +}
14957 +
14958 +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
14959 +                             struct pt_regs * regs, long error_code,
14960 +                             siginfo_t *info)
14961 +{
14962 +       struct task_struct *tsk = current;
14963 +       tsk->thread.error_code = error_code;
14964 +       tsk->thread.trap_no = trapnr;
14965 +
14966 +       if (regs->eflags & VM_MASK) {
14967 +               if (vm86)
14968 +                       goto vm86_trap;
14969 +               goto trap_signal;
14970 +       }
14971 +
14972 +       if (!user_mode(regs))
14973 +               goto kernel_trap;
14974 +
14975 +       trap_signal: {
14976 +               if (info)
14977 +                       force_sig_info(signr, info, tsk);
14978 +               else
14979 +                       force_sig(signr, tsk);
14980 +               return;
14981 +       }
14982 +
14983 +       kernel_trap: {
14984 +               if (!fixup_exception(regs))
14985 +                       die(str, regs, error_code);
14986 +               return;
14987 +       }
14988 +
14989 +       vm86_trap: {
14990 +               int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
14991 +               if (ret) goto trap_signal;
14992 +               return;
14993 +       }
14994 +}
14995 +
14996 +#define DO_ERROR(trapnr, signr, str, name) \
14997 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
14998 +{ \
14999 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15000 +                                               == NOTIFY_STOP) \
15001 +               return; \
15002 +       do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
15003 +}
15004 +
15005 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15006 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15007 +{ \
15008 +       siginfo_t info; \
15009 +       info.si_signo = signr; \
15010 +       info.si_errno = 0; \
15011 +       info.si_code = sicode; \
15012 +       info.si_addr = (void __user *)siaddr; \
15013 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15014 +                                               == NOTIFY_STOP) \
15015 +               return; \
15016 +       do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
15017 +}
15018 +
15019 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
15020 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15021 +{ \
15022 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15023 +                                               == NOTIFY_STOP) \
15024 +               return; \
15025 +       do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
15026 +}
15027 +
15028 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15029 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15030 +{ \
15031 +       siginfo_t info; \
15032 +       info.si_signo = signr; \
15033 +       info.si_errno = 0; \
15034 +       info.si_code = sicode; \
15035 +       info.si_addr = (void __user *)siaddr; \
15036 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15037 +                                               == NOTIFY_STOP) \
15038 +               return; \
15039 +       do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
15040 +}
15041 +
15042 +DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
15043 +#ifndef CONFIG_KPROBES
15044 +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
15045 +#endif
15046 +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
15047 +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
15048 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
15049 +DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
15050 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
15051 +DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
15052 +DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
15053 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
15054 +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
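/*
 * For illustration, DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
 * above expands to:
 *
 *      fastcall void do_invalid_TSS(struct pt_regs * regs, long error_code)
 *      {
 *              if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code,
 *                                              10, SIGSEGV) == NOTIFY_STOP)
 *                      return;
 *              do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code,
 *                      NULL);
 *      }
 */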
15055 +
15056 +fastcall void __kprobes do_general_protection(struct pt_regs * regs,
15057 +                                             long error_code)
15058 +{
15059 +       current->thread.error_code = error_code;
15060 +       current->thread.trap_no = 13;
15061 +
15062 +       if (regs->eflags & VM_MASK)
15063 +               goto gp_in_vm86;
15064 +
15065 +       if (!user_mode(regs))
15066 +               goto gp_in_kernel;
15067 +
15068 +       current->thread.error_code = error_code;
15069 +       current->thread.trap_no = 13;
15070 +       force_sig(SIGSEGV, current);
15071 +       return;
15072 +
15073 +gp_in_vm86:
15074 +       local_irq_enable();
15075 +       handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
15076 +       return;
15077 +
15078 +gp_in_kernel:
15079 +       if (!fixup_exception(regs)) {
15080 +               if (notify_die(DIE_GPF, "general protection fault", regs,
15081 +                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
15082 +                       return;
15083 +               die("general protection fault", regs, error_code);
15084 +       }
15085 +}
15086 +
15087 +static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
15088 +{
15089 +       printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
15090 +                       "to continue\n");
15091 +       printk(KERN_EMERG "You probably have a hardware problem with your RAM "
15092 +                       "chips\n");
15093 +
15094 +       /* Clear and disable the memory parity error line. */
15095 +       clear_mem_error(reason);
15096 +}
15097 +
15098 +static void io_check_error(unsigned char reason, struct pt_regs * regs)
15099 +{
15100 +       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
15101 +       show_registers(regs);
15102 +
15103 +       /* Re-enable the IOCK line, wait for a few seconds */
15104 +       clear_io_check_error(reason);
15105 +}
15106 +
15107 +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
15108 +{
15109 +#ifdef CONFIG_MCA
15110 +       /* Might actually be able to figure out what the guilty party
15111 +        * is. */
15112 +       if( MCA_bus ) {
15113 +               mca_handle_nmi();
15114 +               return;
15115 +       }
15116 +#endif
15117 +       printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
15118 +               reason, smp_processor_id());
15119 +       printk("Dazed and confused, but trying to continue\n");
15120 +       printk("Do you have a strange power saving mode enabled?\n");
15121 +}
15122 +
15123 +static DEFINE_SPINLOCK(nmi_print_lock);
15124 +
15125 +void die_nmi (struct pt_regs *regs, const char *msg)
15126 +{
15127 +       if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) ==
15128 +           NOTIFY_STOP)
15129 +               return;
15130 +
15131 +       spin_lock(&nmi_print_lock);
15132 +       /*
15133 +        * We are in trouble anyway, let's at least try
15134 +        * to get a message out.
15135 +        */
15136 +       bust_spinlocks(1);
15137 +       printk(KERN_EMERG "%s", msg);
15138 +       printk(" on CPU%d, eip %08lx, registers:\n",
15139 +               smp_processor_id(), regs->eip);
15140 +       show_registers(regs);
15141 +       printk(KERN_EMERG "console shuts up ...\n");
15142 +       console_silent();
15143 +       spin_unlock(&nmi_print_lock);
15144 +       bust_spinlocks(0);
15145 +
15146 +       /* If we are in the kernel we are probably nested up pretty badly
15147 +        * and might as well get out now while we still can.
15148 +        */
15149 +       if (!user_mode(regs)) {
15150 +               current->thread.trap_no = 2;
15151 +               crash_kexec(regs);
15152 +       }
15153 +
15154 +       do_exit(SIGSEGV);
15155 +}
15156 +
15157 +static void default_do_nmi(struct pt_regs * regs)
15158 +{
15159 +       unsigned char reason = 0;
15160 +
15161 +       /* Only the BSP gets external NMIs from the system.  */
15162 +       if (!smp_processor_id())
15163 +               reason = get_nmi_reason();
15164 +
15165 +       if (!(reason & 0xc0)) {
15166 +               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
15167 +                                                       == NOTIFY_STOP)
15168 +                       return;
15169 +#ifdef CONFIG_X86_LOCAL_APIC
15170 +               /*
15171 +                * Ok, so this is none of the documented NMI sources,
15172 +                * so it must be the NMI watchdog.
15173 +                */
15174 +               if (nmi_watchdog) {
15175 +                       nmi_watchdog_tick(regs);
15176 +                       return;
15177 +               }
15178 +#endif
15179 +               unknown_nmi_error(reason, regs);
15180 +               return;
15181 +       }
15182 +       if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
15183 +               return;
15184 +       if (reason & 0x80)
15185 +               mem_parity_error(reason, regs);
15186 +       if (reason & 0x40)
15187 +               io_check_error(reason, regs);
15188 +       /*
15189 +        * Reassert NMI in case it became active meanwhile
15190 +        * as it's edge-triggered.
15191 +        */
15192 +       reassert_nmi();
15193 +}
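/*
 * For reference, a sketch (hypothetical name, needs <asm/io.h>) of
 * what get_nmi_reason() amounts to on the default PC subarchitecture:
 * a read of system control port B, where bit 7 (0x80) reports a
 * memory parity error and bit 6 (0x40) an I/O channel check -- the
 * two bits tested above.  The subarch's mach_traps.h, included at the
 * top of this file, provides the real definition.
 */
static inline unsigned char example_nmi_reason(void)
{
        return inb(0x61);
}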
15194 +
15195 +static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
15196 +{
15197 +       return 0;
15198 +}
15199 +
15200 +static nmi_callback_t nmi_callback = dummy_nmi_callback;
15201 +
15202 +fastcall void do_nmi(struct pt_regs * regs, long error_code)
15203 +{
15204 +       int cpu;
15205 +
15206 +       nmi_enter();
15207 +
15208 +       cpu = smp_processor_id();
15209 +
15210 +       ++nmi_count(cpu);
15211 +
15212 +       if (!rcu_dereference(nmi_callback)(regs, cpu))
15213 +               default_do_nmi(regs);
15214 +
15215 +       nmi_exit();
15216 +}
15217 +
15218 +void set_nmi_callback(nmi_callback_t callback)
15219 +{
15220 +       rcu_assign_pointer(nmi_callback, callback);
15221 +}
15222 +EXPORT_SYMBOL_GPL(set_nmi_callback);
15223 +
15224 +void unset_nmi_callback(void)
15225 +{
15226 +       nmi_callback = dummy_nmi_callback;
15227 +}
15228 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
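/*
 * A minimal sketch of a client of the two hooks above (hypothetical
 * handler name; the signature is dictated by nmi_callback_t):
 */
static int example_nmi_handler(struct pt_regs *regs, int cpu)
{
        /* Returning non-zero means "handled", so do_nmi() skips
         * default_do_nmi(). */
        return 1;
}

/* set_nmi_callback(example_nmi_handler); ... unset_nmi_callback(); */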
15229 +
15230 +#ifdef CONFIG_KPROBES
15231 +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
15232 +{
15233 +       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
15234 +                       == NOTIFY_STOP)
15235 +               return;
15236 +       /* This is an interrupt gate, because kprobes wants interrupts
15237 +        * disabled.  Normal trap handlers don't. */
15238 +       restore_interrupts(regs);
15239 +       do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
15240 +}
15241 +#endif
15242 +
15243 +/*
15244 + * Our handling of the processor debug registers is non-trivial.
15245 + * We do not clear them on entry and exit from the kernel. Therefore
15246 + * it is possible to get a watchpoint trap here from inside the kernel.
15247 + * However, the code in ./ptrace.c has ensured that the user can
15248 + * only set watchpoints on userspace addresses. Therefore the in-kernel
15249 + * watchpoint trap can only occur in code which is reading/writing
15250 + * from user space. Such code must not hold kernel locks (since it
15251 + * can equally take a page fault), therefore it is safe to call
15252 + * force_sig_info even though that claims and releases locks.
15253 + * 
15254 + * Code in ./signal.c ensures that the debug control register
15255 + * is restored before we deliver any signal, and therefore that
15256 + * user code runs with the correct debug control register even though
15257 + * we clear it here.
15258 + *
15259 + * Being careful here means that we don't have to be as careful in a
15260 + * lot of more complicated places (task switching can be a bit lazy
15261 + * about restoring all the debug state, and ptrace doesn't have to
15262 + * find every occurrence of the TF bit that could be saved away even
15263 + * by user code)
15264 + */
15265 +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
15266 +{
15267 +       unsigned int condition;
15268 +       struct task_struct *tsk = current;
15269 +
15270 +       get_debugreg(condition, 6);
15271 +
15272 +       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
15273 +                                       SIGTRAP) == NOTIFY_STOP)
15274 +               return;
15275 +       /* It's safe to allow irq's after DR6 has been saved */
15276 +       if (regs->eflags & X86_EFLAGS_IF)
15277 +               local_irq_enable();
15278 +
15279 +       /* Mask out spurious debug traps due to lazy DR7 setting */
15280 +       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
15281 +               if (!tsk->thread.debugreg[7])
15282 +                       goto clear_dr7;
15283 +       }
15284 +
15285 +       if (regs->eflags & VM_MASK)
15286 +               goto debug_vm86;
15287 +
15288 +       /* Save debug status register where ptrace can see it */
15289 +       tsk->thread.debugreg[6] = condition;
15290 +
15291 +       /*
15292 +        * Single-stepping through TF: make sure we ignore any events in
15293 +        * kernel space (but re-enable TF when returning to user mode).
15294 +        */
15295 +       if (condition & DR_STEP) {
15296 +               /*
15297 +                * We already checked v86 mode above, so we can
15298 +                * check for kernel mode by just checking the CPL
15299 +                * of CS.
15300 +                */
15301 +               if (!user_mode(regs))
15302 +                       goto clear_TF_reenable;
15303 +       }
15304 +
15305 +       /* Ok, finally something we can handle */
15306 +       send_sigtrap(tsk, regs, error_code);
15307 +
15308 +       /* Disable additional traps. They'll be re-enabled when
15309 +        * the signal is delivered.
15310 +        */
15311 +clear_dr7:
15312 +       set_debugreg(0, 7);
15313 +       return;
15314 +
15315 +debug_vm86:
15316 +       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
15317 +       return;
15318 +
15319 +clear_TF_reenable:
15320 +       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
15321 +       regs->eflags &= ~TF_MASK;
15322 +       return;
15323 +}
15324 +
15325 +/*
15326 + * Note that we play around with the 'TS' bit in an attempt to get
15327 + * the correct behaviour even in the presence of the asynchronous
15328 + * IRQ13 behaviour.
15329 + */
15330 +void math_error(void __user *eip)
15331 +{
15332 +       struct task_struct * task;
15333 +       siginfo_t info;
15334 +       unsigned short cwd, swd;
15335 +
15336 +       /*
15337 +        * Save the info for the exception handler and clear the error.
15338 +        */
15339 +       task = current;
15340 +       save_init_fpu(task);
15341 +       task->thread.trap_no = 16;
15342 +       task->thread.error_code = 0;
15343 +       info.si_signo = SIGFPE;
15344 +       info.si_errno = 0;
15345 +       info.si_code = __SI_FAULT;
15346 +       info.si_addr = eip;
15347 +       /*
15348 +        * (~cwd & swd) will mask out exceptions that are not set to unmasked
15349 +        * status.  0x3f is the exception bits in these regs, 0x200 is the
15350 +        * C1 bit you need in case of a stack fault, 0x040 is the stack
15351 +        * fault bit.  We should only be taking one exception at a time,
15352 +        * so if this combination doesn't produce any single exception,
15353 +        * then we have a bad program that isn't synchronizing its FPU usage
15354 +        * and it will suffer the consequences since we won't be able to
15355 +        * fully reproduce the context of the exception.
15356 +        */
15357 +       cwd = get_fpu_cwd(task);
15358 +       swd = get_fpu_swd(task);
15359 +       switch (swd & ~cwd & 0x3f) {
15360 +               case 0x000: /* No unmasked exception */
15361 +                       return;
15362 +               default:    /* Multiple exceptions */
15363 +                       break;
15364 +               case 0x001: /* Invalid Op */
15365 +                       /*
15366 +                        * swd & 0x240 == 0x040: Stack Underflow
15367 +                        * swd & 0x240 == 0x240: Stack Overflow
15368 +                        * User must clear the SF bit (0x40) if set
15369 +                        */
15370 +                       info.si_code = FPE_FLTINV;
15371 +                       break;
15372 +               case 0x002: /* Denormalize */
15373 +               case 0x010: /* Underflow */
15374 +                       info.si_code = FPE_FLTUND;
15375 +                       break;
15376 +               case 0x004: /* Zero Divide */
15377 +                       info.si_code = FPE_FLTDIV;
15378 +                       break;
15379 +               case 0x008: /* Overflow */
15380 +                       info.si_code = FPE_FLTOVF;
15381 +                       break;
15382 +               case 0x020: /* Precision */
15383 +                       info.si_code = FPE_FLTRES;
15384 +                       break;
15385 +       }
15386 +       force_sig_info(SIGFPE, &info, task);
15387 +}
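/*
 * Worked example for the switch above (values assumed): with only the
 * divide-by-zero exception unmasked, cwd == 0x037b, and a 1.0/0.0
 * sets ZE in the status word, so swd & ~cwd & 0x3f == 0x004 and the
 * SIGFPE is delivered with si_code == FPE_FLTDIV.
 */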
15388 +
15389 +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
15390 +{
15391 +       ignore_fpu_irq = 1;
15392 +       math_error((void __user *)regs->eip);
15393 +}
15394 +
15395 +static void simd_math_error(void __user *eip)
15396 +{
15397 +       struct task_struct * task;
15398 +       siginfo_t info;
15399 +       unsigned short mxcsr;
15400 +
15401 +       /*
15402 +        * Save the info for the exception handler and clear the error.
15403 +        */
15404 +       task = current;
15405 +       save_init_fpu(task);
15406 +       task->thread.trap_no = 19;
15407 +       task->thread.error_code = 0;
15408 +       info.si_signo = SIGFPE;
15409 +       info.si_errno = 0;
15410 +       info.si_code = __SI_FAULT;
15411 +       info.si_addr = eip;
15412 +       /*
15413 +        * The SIMD FPU exceptions are handled a little differently, as there
15414 +        * is only a single status/control register.  Thus, to determine which
15415 +        * unmasked exception was caught we must mask the exception mask bits
15416 +        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
15417 +        */
15418 +       mxcsr = get_fpu_mxcsr(task);
15419 +       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
15420 +               case 0x000:
15421 +               default:
15422 +                       break;
15423 +               case 0x001: /* Invalid Op */
15424 +                       info.si_code = FPE_FLTINV;
15425 +                       break;
15426 +               case 0x002: /* Denormalize */
15427 +               case 0x010: /* Underflow */
15428 +                       info.si_code = FPE_FLTUND;
15429 +                       break;
15430 +               case 0x004: /* Zero Divide */
15431 +                       info.si_code = FPE_FLTDIV;
15432 +                       break;
15433 +               case 0x008: /* Overflow */
15434 +                       info.si_code = FPE_FLTOVF;
15435 +                       break;
15436 +               case 0x020: /* Precision */
15437 +                       info.si_code = FPE_FLTRES;
15438 +                       break;
15439 +       }
15440 +       force_sig_info(SIGFPE, &info, task);
15441 +}
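/*
 * Same arithmetic worked for SSE (values assumed): the mask bits at
 * 0x1f80 shift down by 7 to line up over the flag bits at 0x3f, so
 * with ZM (0x0200) clear and ZE (0x0004) set the expression yields
 * 0x004 and si_code again becomes FPE_FLTDIV.
 */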
15442 +
15443 +fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
15444 +                                         long error_code)
15445 +{
15446 +       if (cpu_has_xmm) {
15447 +               /* Handle SIMD FPU exceptions on PIII+ processors. */
15448 +               ignore_fpu_irq = 1;
15449 +               simd_math_error((void __user *)regs->eip);
15450 +       } else {
15451 +               /*
15452 +                * Handle strange cache flush from user space exception
15453 +                * in all other cases.  This is undocumented behaviour.
15454 +                */
15455 +               if (regs->eflags & VM_MASK) {
15456 +                       handle_vm86_fault((struct kernel_vm86_regs *)regs,
15457 +                                         error_code);
15458 +                       return;
15459 +               }
15460 +               current->thread.trap_no = 19;
15461 +               current->thread.error_code = error_code;
15462 +               die_if_kernel("cache flush denied", regs, error_code);
15463 +               force_sig(SIGSEGV, current);
15464 +       }
15465 +}
15466 +
15467 +#ifndef CONFIG_XEN
15468 +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
15469 +                                         long error_code)
15470 +{
15471 +#if 0
15472 +       /* No need to warn about this any longer. */
15473 +       printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
15474 +#endif
15475 +}
15476 +
15477 +fastcall void setup_x86_bogus_stack(unsigned char * stk)
15478 +{
15479 +       unsigned long *switch16_ptr, *switch32_ptr;
15480 +       struct pt_regs *regs;
15481 +       unsigned long stack_top, stack_bot;
15482 +       unsigned short iret_frame16_off;
15483 +       int cpu = smp_processor_id();
15484 +       /* reserve the space on 32bit stack for the magic switch16 pointer */
15485 +       memmove(stk, stk + 8, sizeof(struct pt_regs));
15486 +       switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
15487 +       regs = (struct pt_regs *)stk;
15488 +       /* now the switch32 on 16bit stack */
15489 +       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
15490 +       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
15491 +       switch32_ptr = (unsigned long *)(stack_top - 8);
15492 +       iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
15493 +       /* copy iret frame on 16bit stack */
15494 +       memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
15495 +       /* fill in the switch pointers */
15496 +       switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
15497 +       switch16_ptr[1] = __ESPFIX_SS;
15498 +       switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
15499 +               8 - CPU_16BIT_STACK_SIZE;
15500 +       switch32_ptr[1] = __KERNEL_DS;
15501 +}
15502 +
15503 +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
15504 +{
15505 +       unsigned long *switch32_ptr;
15506 +       unsigned char *stack16, *stack32;
15507 +       unsigned long stack_top, stack_bot;
15508 +       int len;
15509 +       int cpu = smp_processor_id();
15510 +       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
15511 +       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
15512 +       switch32_ptr = (unsigned long *)(stack_top - 8);
15513 +       /* copy the data from 16bit stack to 32bit stack */
15514 +       len = CPU_16BIT_STACK_SIZE - 8 - sp;
15515 +       stack16 = (unsigned char *)(stack_bot + sp);
15516 +       stack32 = (unsigned char *)
15517 +               (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
15518 +       memcpy(stack32, stack16, len);
15519 +       return stack32;
15520 +}
15521 +#endif
15522 +
15523 +/*
15524 + *  'math_state_restore()' saves the current math information in the
15525 + * old math state array, and gets the new ones from the current task
15526 + *
15527 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
15528 + * Don't touch unless you *really* know how it works.
15529 + *
15530 + * Must be called with kernel preemption disabled (in this case,
15531 + * local interrupts are disabled at the call-site in entry.S).
15532 + */
15533 +asmlinkage void math_state_restore(struct pt_regs regs)
15534 +{
15535 +       struct thread_info *thread = current_thread_info();
15536 +       struct task_struct *tsk = thread->task;
15537 +
15538 +       /* NB. 'clts' is done for us by Xen during virtual trap. */
15539 +       if (!tsk_used_math(tsk))
15540 +               init_fpu(tsk);
15541 +       restore_fpu(tsk);
15542 +       thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
15543 +}
15544 +
15545 +#ifndef CONFIG_MATH_EMULATION
15546 +
15547 +asmlinkage void math_emulate(long arg)
15548 +{
15549 +       printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
15550 +       printk(KERN_EMERG "killing %s.\n",current->comm);
15551 +       force_sig(SIGFPE,current);
15552 +       schedule();
15553 +}
15554 +
15555 +#endif /* CONFIG_MATH_EMULATION */
15556 +
15557 +#ifdef CONFIG_X86_F00F_BUG
15558 +void __init trap_init_f00f_bug(void)
15559 +{
15560 +       __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
15561 +
15562 +       /*
15563 +        * Update the IDT descriptor and reload the IDT so that
15564 +        * it uses the read-only mapped virtual address.
15565 +        */
15566 +       idt_descr.address = fix_to_virt(FIX_F00F_IDT);
15567 +       load_idt(&idt_descr);
15568 +}
15569 +#endif
15570 +
15571 +
15572 +/*
15573 + * NB. All these are "trap gates" (i.e. events_mask isn't set) except
15574 + * for those that specify <dpl>|4 in the second field.
15575 + */
15576 +static trap_info_t trap_table[] = {
15577 +       {  0, 0, __KERNEL_CS, (unsigned long)divide_error               },
15578 +       {  1, 0|4, __KERNEL_CS, (unsigned long)debug                    },
15579 +       {  3, 3|4, __KERNEL_CS, (unsigned long)int3                     },
15580 +       {  4, 3, __KERNEL_CS, (unsigned long)overflow                   },
15581 +       {  5, 0, __KERNEL_CS, (unsigned long)bounds                     },
15582 +       {  6, 0, __KERNEL_CS, (unsigned long)invalid_op                 },
15583 +       {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available     },
15584 +       {  9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
15585 +       { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS                },
15586 +       { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present        },
15587 +       { 12, 0, __KERNEL_CS, (unsigned long)stack_segment              },
15588 +       { 13, 0, __KERNEL_CS, (unsigned long)general_protection         },
15589 +       { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault               },
15590 +       { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment          },
15591 +       { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error          },
15592 +       { 17, 0, __KERNEL_CS, (unsigned long)alignment_check            },
15593 +#ifdef CONFIG_X86_MCE
15594 +       { 18, 0, __KERNEL_CS, (unsigned long)machine_check              },
15595 +#endif
15596 +       { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
15597 +       { SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call   },
15598 +       {  0, 0,           0, 0                                         }
15599 +};
15600 +
15601 +void __init trap_init(void)
15602 +{
15603 +       HYPERVISOR_set_trap_table(trap_table);
15604 +
15605 +       if (cpu_has_fxsr) {
15606 +               /*
15607 +                * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
15608 +                * Generates a compile-time "error: zero width for bit-field" if
15609 +                * the alignment is wrong.
15610 +                */
15611 +               struct fxsrAlignAssert {
15612 +                       int _:!(offsetof(struct task_struct,
15613 +                                       thread.i387.fxsave) & 15);
15614 +               };
15615 +
15616 +               printk(KERN_INFO "Enabling fast FPU save and restore... ");
15617 +               set_in_cr4(X86_CR4_OSFXSR);
15618 +               printk("done.\n");
15619 +       }
15620 +       if (cpu_has_xmm) {
15621 +               printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
15622 +                               "support... ");
15623 +               set_in_cr4(X86_CR4_OSXMMEXCPT);
15624 +               printk("done.\n");
15625 +       }
15626 +
15627 +       /*
15628 +        * Should be a barrier for any external CPU state.
15629 +        */
15630 +       cpu_init();
15631 +}
15632 +
15633 +void smp_trap_init(trap_info_t *trap_ctxt)
15634 +{
15635 +       trap_info_t *t;
15636 +
15637 +       for (t = trap_table; t->address; t++) {
15638 +               trap_ctxt[t->vector].flags = t->flags;
15639 +               trap_ctxt[t->vector].cs = t->cs;
15640 +               trap_ctxt[t->vector].address = t->address;
15641 +       }
15642 +}
15643 +
15644 +static int __init kstack_setup(char *s)
15645 +{
15646 +       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
15647 +       return 0;
15648 +}
15649 +__setup("kstack=", kstack_setup);
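/* Usage: booting with e.g. "kstack=48" deepens the raw stack dump
 * printed by show_stack_log_lvl() from the default 24 words. */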
15650 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/vm86.c tmp-linux-2.6-xen.patch/arch/i386/kernel/vm86.c
15651 --- ref-linux-2.6.16.9/arch/i386/kernel/vm86.c  2006-04-19 08:10:14.000000000 +0200
15652 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/vm86.c     2006-04-10 00:05:52.000000000 +0200
15653 @@ -97,7 +97,9 @@
15654  struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
15655  struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
15656  {
15657 +#ifndef CONFIG_X86_NO_TSS
15658         struct tss_struct *tss;
15659 +#endif
15660         struct pt_regs *ret;
15661         unsigned long tmp;
15662  
15663 @@ -122,7 +124,9 @@ struct pt_regs * fastcall save_v86_state
15664                 do_exit(SIGSEGV);
15665         }
15666  
15667 +#ifndef CONFIG_X86_NO_TSS
15668         tss = &per_cpu(init_tss, get_cpu());
15669 +#endif
15670         current->thread.esp0 = current->thread.saved_esp0;
15671         current->thread.sysenter_cs = __KERNEL_CS;
15672         load_esp0(tss, &current->thread);
15673 @@ -251,7 +255,9 @@ out:
15674  
15675  static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
15676  {
15677 +#ifndef CONFIG_X86_NO_TSS
15678         struct tss_struct *tss;
15679 +#endif
15680  /*
15681   * make sure the vm86() system call doesn't try to do anything silly
15682   */
15683 @@ -295,7 +301,9 @@ static void do_sys_vm86(struct kernel_vm
15684         savesegment(fs, tsk->thread.saved_fs);
15685         savesegment(gs, tsk->thread.saved_gs);
15686  
15687 +#ifndef CONFIG_X86_NO_TSS
15688         tss = &per_cpu(init_tss, get_cpu());
15689 +#endif
15690         tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
15691         if (cpu_has_sep)
15692                 tsk->thread.sysenter_cs = 0;
15693 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/vmlinux.lds.S tmp-linux-2.6-xen.patch/arch/i386/kernel/vmlinux.lds.S
15694 --- ref-linux-2.6.16.9/arch/i386/kernel/vmlinux.lds.S   2006-04-19 08:10:14.000000000 +0200
15695 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/vmlinux.lds.S      2006-04-10 00:05:52.000000000 +0200
15696 @@ -34,6 +34,13 @@ SECTIONS
15697    __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
15698    __stop___ex_table = .;
15699  
15700 +  . = ALIGN(16);
15701 +  __start_smp_alternatives_table = .;
15702 +  __smp_alternatives : AT(ADDR(__smp_alternatives) - LOAD_OFFSET) { *(__smp_alternatives) }
15703 +  __stop_smp_alternatives_table = .;
15704 +
15705 +  __smp_replacements : AT(ADDR(__smp_replacements) - LOAD_OFFSET) { *(__smp_replacements) }
15706 +
15707    RODATA
15708  
15709    /* writeable */
15710 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/vsyscall-note-xen.S tmp-linux-2.6-xen.patch/arch/i386/kernel/vsyscall-note-xen.S
15711 --- ref-linux-2.6.16.9/arch/i386/kernel/vsyscall-note-xen.S     1970-01-01 01:00:00.000000000 +0100
15712 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/vsyscall-note-xen.S        2006-04-10 00:05:52.000000000 +0200
15713 @@ -0,0 +1,32 @@
15714 +/*
15715 + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
15716 + * Here we can supply some information useful to userland.
15717 + * First we get the vanilla i386 note that supplies the kernel version info.
15718 + */
15719 +
15720 +#include "vsyscall-note.S"
15721 +
15722 +/*
15723 + * Now we add a special note telling glibc's dynamic linker a fake hardware
15724 + * flavor that it will use to choose the search path for libraries in the
15725 + * same way it uses real hardware capabilities like "mmx".
15726 + * We supply "nosegneg" as the fake capability, to indicate that we
15727 + * do not like negative offsets in instructions using segment overrides,
15728 + * since we implement those inefficiently.  This makes it possible to
15729 + * install libraries optimized to avoid those access patterns in someplace
15730 + * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d file
15731 + * corresponding to the bits here is needed to make ldconfig work right.
15732 + * It should contain:
15733 + *     hwcap 0 nosegneg
15734 + * to match the mapping of bit to name that we give here.
15735 + */
15736 +#define NOTE_KERNELCAP_BEGIN(ncaps, mask) \
15737 +       ASM_ELF_NOTE_BEGIN(".note.kernelcap", "a", "GNU", 2) \
15738 +       .long ncaps, mask
15739 +#define NOTE_KERNELCAP(bit, name) \
15740 +       .byte bit; .asciz name
15741 +#define NOTE_KERNELCAP_END ASM_ELF_NOTE_END
15742 +
15743 +NOTE_KERNELCAP_BEGIN(1, 1)
15744 +NOTE_KERNELCAP(1, "nosegneg")  /* Change 1 back to 0 when glibc is fixed! */
15745 +NOTE_KERNELCAP_END
15746 diff -Nurp ref-linux-2.6.16.9/arch/i386/kernel/vsyscall.S tmp-linux-2.6-xen.patch/arch/i386/kernel/vsyscall.S
15747 --- ref-linux-2.6.16.9/arch/i386/kernel/vsyscall.S      2006-04-19 08:10:14.000000000 +0200
15748 +++ tmp-linux-2.6-xen.patch/arch/i386/kernel/vsyscall.S 2006-04-10 00:05:52.000000000 +0200
15749 @@ -7,9 +7,11 @@ vsyscall_int80_start:
15750         .incbin "arch/i386/kernel/vsyscall-int80.so"
15751  vsyscall_int80_end:
15752  
15753 +#ifdef CONFIG_X86_SYSENTER
15754         .globl vsyscall_sysenter_start, vsyscall_sysenter_end
15755  vsyscall_sysenter_start:
15756         .incbin "arch/i386/kernel/vsyscall-sysenter.so"
15757  vsyscall_sysenter_end:
15758 +#endif
15759  
15760  __FINIT
15761 diff -Nurp ref-linux-2.6.16.9/arch/i386/mach-xen/Makefile tmp-linux-2.6-xen.patch/arch/i386/mach-xen/Makefile
15762 --- ref-linux-2.6.16.9/arch/i386/mach-xen/Makefile      1970-01-01 01:00:00.000000000 +0100
15763 +++ tmp-linux-2.6-xen.patch/arch/i386/mach-xen/Makefile 2006-04-10 00:05:52.000000000 +0200
15764 @@ -0,0 +1,5 @@
15765 +#
15766 +# Makefile for the linux kernel.
15767 +#
15768 +
15769 +obj-y                          := setup.o
15770 diff -Nurp ref-linux-2.6.16.9/arch/i386/mach-xen/setup.c tmp-linux-2.6-xen.patch/arch/i386/mach-xen/setup.c
15771 --- ref-linux-2.6.16.9/arch/i386/mach-xen/setup.c       1970-01-01 01:00:00.000000000 +0100
15772 +++ tmp-linux-2.6-xen.patch/arch/i386/mach-xen/setup.c  2006-04-10 00:05:52.000000000 +0200
15773 @@ -0,0 +1,37 @@
15774 +/*
15775 + *     Machine specific setup for generic
15776 + */
15777 +
15778 +#include <linux/config.h>
15779 +#include <linux/smp.h>
15780 +#include <linux/init.h>
15781 +#include <linux/interrupt.h>
15782 +#include <asm/acpi.h>
15783 +#include <asm/arch_hooks.h>
15784 +
15785 +#ifdef CONFIG_HOTPLUG_CPU
15786 +#define DEFAULT_SEND_IPI       (1)
15787 +#else
15788 +#define DEFAULT_SEND_IPI       (0)
15789 +#endif
15790 +
15791 +int no_broadcast=DEFAULT_SEND_IPI;
15792 +
15793 +static __init int no_ipi_broadcast(char *str)
15794 +{
15795 +       get_option(&str, &no_broadcast);
15796 +       printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
15797 +                                                                                       "IPI Broadcast");
15798 +       return 1;
15799 +}
15800 +
15801 +__setup("no_ipi_broadcast", no_ipi_broadcast);
15802 +
15803 +static int __init print_ipi_mode(void)
15804 +{
15805 +       printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
15806 +                                                                                       "Shortcut");
15807 +       return 0;
15808 +}
15809 +
15810 +late_initcall(print_ipi_mode);
15811 diff -Nurp ref-linux-2.6.16.9/arch/i386/Makefile tmp-linux-2.6-xen.patch/arch/i386/Makefile
15812 --- ref-linux-2.6.16.9/arch/i386/Makefile       2006-04-19 08:10:14.000000000 +0200
15813 +++ tmp-linux-2.6-xen.patch/arch/i386/Makefile  2006-04-10 00:05:52.000000000 +0200
15814 @@ -45,6 +45,11 @@ CFLAGS                               += $(shell if [ $(call cc-vers
15815  
15816  CFLAGS += $(cflags-y)
15817  
15818 +cppflags-$(CONFIG_XEN) += \
15819 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
15820 +
15821 +CPPFLAGS += $(cppflags-y)
15822 +
15823  # Default subarch .c files
15824  mcore-y  := mach-default
15825  
15826 @@ -68,6 +73,10 @@ mcore-$(CONFIG_X86_BIGSMP)   := mach-defau
15827  mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
15828  mcore-$(CONFIG_X86_SUMMIT)  := mach-default
15829  
15830 +# Xen subarch support
15831 +mflags-$(CONFIG_X86_XEN)       := -Iinclude/asm-i386/mach-xen
15832 +mcore-$(CONFIG_X86_XEN)                := mach-xen
15833 +
15834  # generic subarchitecture
15835  mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
15836  mcore-$(CONFIG_X86_GENERICARCH) := mach-default
15837 @@ -102,6 +111,19 @@ boot := arch/i386/boot
15838  .PHONY: zImage bzImage compressed zlilo bzlilo \
15839         zdisk bzdisk fdimage fdimage144 fdimage288 install
15840  
15841 +ifdef CONFIG_XEN
15842 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
15843 +head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task-xen.o
15844 +boot := arch/i386/boot-xen
15845 +.PHONY: vmlinuz
15846 +all: vmlinuz
15847 +
15848 +vmlinuz: vmlinux
15849 +       $(Q)$(MAKE) $(build)=$(boot) $@
15850 +
15851 +install:
15852 +       $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
15853 +else
15854  all: bzImage
15855  
15856  # KBUILD_IMAGE specify target image being built
15857 @@ -124,6 +146,7 @@ fdimage fdimage144 fdimage288: vmlinux
15858  
15859  install:
15860         $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
15861 +endif
15862  
15863  archclean:
15864         $(Q)$(MAKE) $(clean)=arch/i386/boot
15865 diff -Nurp ref-linux-2.6.16.9/arch/i386/mm/fault-xen.c tmp-linux-2.6-xen.patch/arch/i386/mm/fault-xen.c
15866 --- ref-linux-2.6.16.9/arch/i386/mm/fault-xen.c 1970-01-01 01:00:00.000000000 +0100
15867 +++ tmp-linux-2.6-xen.patch/arch/i386/mm/fault-xen.c    2006-04-10 00:05:52.000000000 +0200
15868 @@ -0,0 +1,617 @@
15869 +/*
15870 + *  linux/arch/i386/mm/fault.c
15871 + *
15872 + *  Copyright (C) 1995  Linus Torvalds
15873 + */
15874 +
15875 +#include <linux/signal.h>
15876 +#include <linux/sched.h>
15877 +#include <linux/kernel.h>
15878 +#include <linux/errno.h>
15879 +#include <linux/string.h>
15880 +#include <linux/types.h>
15881 +#include <linux/ptrace.h>
15882 +#include <linux/mman.h>
15883 +#include <linux/mm.h>
15884 +#include <linux/smp.h>
15885 +#include <linux/smp_lock.h>
15886 +#include <linux/interrupt.h>
15887 +#include <linux/init.h>
15888 +#include <linux/tty.h>
15889 +#include <linux/vt_kern.h>             /* For unblank_screen() */
15890 +#include <linux/highmem.h>
15891 +#include <linux/module.h>
15892 +#include <linux/kprobes.h>
15893 +
15894 +#include <asm/system.h>
15895 +#include <asm/uaccess.h>
15896 +#include <asm/desc.h>
15897 +#include <asm/kdebug.h>
15898 +
15899 +extern void die(const char *,struct pt_regs *,long);
15900 +
15901 +/*
15902 + * Unlock any spinlocks which will prevent us from getting the
15903 + * message out 
15904 + */
15905 +void bust_spinlocks(int yes)
15906 +{
15907 +       int loglevel_save = console_loglevel;
15908 +
15909 +       if (yes) {
15910 +               oops_in_progress = 1;
15911 +               return;
15912 +       }
15913 +#ifdef CONFIG_VT
15914 +       unblank_screen();
15915 +#endif
15916 +       oops_in_progress = 0;
15917 +       /*
15918 +        * OK, the message is on the console.  Now we call printk()
15919 +        * without oops_in_progress set so that printk will give klogd
15920 +        * a poke.  Hold onto your hats...
15921 +        */
15922 +       console_loglevel = 15;          /* NMI oopser may have shut the console up */
15923 +       printk(" ");
15924 +       console_loglevel = loglevel_save;
15925 +}
15926 +
15927 +/*
15928 + * Return EIP plus the CS segment base.  The segment limit is also
15929 + * adjusted, clamped to the kernel/user address space (whichever is
15930 + * appropriate), and returned in *eip_limit.
15931 + *
15932 + * The segment is checked, because it might have been changed by another
15933 + * task between the original faulting instruction and here.
15934 + *
15935 + * If CS is no longer a valid code segment, or if EIP is beyond the
15936 + * limit, or if it is a kernel address when CS is not a kernel segment,
15937 + * then the returned value will be greater than *eip_limit.
15938 + * 
15939 + * This is slow, but is very rarely executed.
15940 + */
15941 +static inline unsigned long get_segment_eip(struct pt_regs *regs,
15942 +                                           unsigned long *eip_limit)
15943 +{
15944 +       unsigned long eip = regs->eip;
15945 +       unsigned seg = regs->xcs & 0xffff;
15946 +       u32 seg_ar, seg_limit, base, *desc;
15947 +
15948 +       /* The standard kernel/user address space limit. */
15949 +       *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
15950 +
15951 +       /* Unlikely, but must come before segment checks. */
15952 +       if (unlikely((regs->eflags & VM_MASK) != 0))
15953 +               return eip + (seg << 4);
15954 +       
15955 +       /* By far the most common cases. */
15956 +       if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
15957 +               return eip;
15958 +
15959 +       /* Check the segment exists, is within the current LDT/GDT size,
15960 +          that kernel/user (ring 0..3) has the appropriate privilege,
15961 +          that it's a code segment, and get the limit. */
15962 +       __asm__ ("larl %3,%0; lsll %3,%1"
15963 +                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
15964 +       if ((~seg_ar & 0x9800) || eip > seg_limit) {
15965 +               *eip_limit = 0;
15966 +               return 1;        /* So that returned eip > *eip_limit. */
15967 +       }
15968 +
15969 +       /* Get the GDT/LDT descriptor base. 
15970 +          When you look for races in this code remember that
15971 +          LDT and other horrors are only used in user space. */
15972 +       if (seg & (1<<2)) {
15973 +               /* Must lock the LDT while reading it. */
15974 +               down(&current->mm->context.sem);
15975 +               desc = current->mm->context.ldt;
15976 +               desc = (void *)desc + (seg & ~7);
15977 +       } else {
15978 +               /* Must disable preemption while reading the GDT. */
15979 +               desc = (u32 *)get_cpu_gdt_table(get_cpu());
15980 +               desc = (void *)desc + (seg & ~7);
15981 +       }
15982 +
15983 +       /* Decode the code segment base from the descriptor */
15984 +       base = get_desc_base((unsigned long *)desc);
15985 +
15986 +       if (seg & (1<<2)) { 
15987 +               up(&current->mm->context.sem);
15988 +       } else
15989 +               put_cpu();
15990 +
15991 +       /* Adjust EIP and segment limit, and clamp at the kernel limit.
15992 +          It's legitimate for segments to wrap at 0xffffffff. */
15993 +       seg_limit += base;
15994 +       if (seg_limit < *eip_limit && seg_limit >= base)
15995 +               *eip_limit = seg_limit;
15996 +       return eip + base;
15997 +}
15998 +
15999 +/* 
16000 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
16001 + * Check that here and ignore it.
16002 + */
16003 +static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
16004 +{ 
16005 +       unsigned long limit;
16006 +       unsigned long instr = get_segment_eip (regs, &limit);
16007 +       int scan_more = 1;
16008 +       int prefetch = 0; 
16009 +       int i;
16010 +
16011 +       for (i = 0; scan_more && i < 15; i++) { 
16012 +               unsigned char opcode;
16013 +               unsigned char instr_hi;
16014 +               unsigned char instr_lo;
16015 +
16016 +               if (instr > limit)
16017 +                       break;
16018 +               if (__get_user(opcode, (unsigned char __user *) instr))
16019 +                       break; 
16020 +
16021 +               instr_hi = opcode & 0xf0; 
16022 +               instr_lo = opcode & 0x0f; 
16023 +               instr++;
16024 +
16025 +               switch (instr_hi) { 
16026 +               case 0x20:
16027 +               case 0x30:
16028 +                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
16029 +                       scan_more = ((instr_lo & 7) == 0x6);
16030 +                       break;
16031 +                       
16032 +               case 0x60:
16033 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
16034 +                       scan_more = (instr_lo & 0xC) == 0x4;
16035 +                       break;          
16036 +               case 0xF0:
16037 +                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
16038 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
16039 +                       break;                  
16040 +               case 0x00:
16041 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
16042 +                       scan_more = 0;
16043 +                       if (instr > limit)
16044 +                               break;
16045 +                       if (__get_user(opcode, (unsigned char __user *) instr))
16046 +                               break;
16047 +                       prefetch = (instr_lo == 0xF) &&
16048 +                               (opcode == 0x0D || opcode == 0x18);
16049 +                       break;                  
16050 +               default:
16051 +                       scan_more = 0;
16052 +                       break;
16053 +               } 
16054 +       }
16055 +       return prefetch;
16056 +}
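/*
 * Worked example for the scan above: "prefetchnta %fs:(%eax)" encodes
 * as 0x64 0x0f 0x18 0x00.  The 0x64 segment prefix is stepped over by
 * the 0x60 case, then the 0x0f byte (instr_lo == 0xF) followed by
 * 0x18 sets prefetch, and the spurious fault is ignored.
 */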
16057 +
16058 +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
16059 +                             unsigned long error_code)
16060 +{
16061 +       if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
16062 +                    boot_cpu_data.x86 >= 6)) {
16063 +               /* Catch an obscure case of prefetch inside an NX page. */
16064 +               if (nx_enabled && (error_code & 16))
16065 +                       return 0;
16066 +               return __is_prefetch(regs, addr);
16067 +       }
16068 +       return 0;
16069 +} 
16070 +
16071 +static noinline void force_sig_info_fault(int si_signo, int si_code,
16072 +       unsigned long address, struct task_struct *tsk)
16073 +{
16074 +       siginfo_t info;
16075 +
16076 +       info.si_signo = si_signo;
16077 +       info.si_errno = 0;
16078 +       info.si_code = si_code;
16079 +       info.si_addr = (void __user *)address;
16080 +       force_sig_info(si_signo, &info, tsk);
16081 +}
16082 +
16083 +fastcall void do_invalid_op(struct pt_regs *, unsigned long);
16084 +
16085 +#ifdef CONFIG_X86_PAE
16086 +static void dump_fault_path(unsigned long address)
16087 +{
16088 +       unsigned long *p, page;
16089 +       unsigned long mfn; 
16090 +
16091 +       page = read_cr3();
16092 +       p  = (unsigned long *)__va(page);
16093 +       p += (address >> 30) * 2;
16094 +       printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
16095 +       if (p[0] & 1) {
16096 +               mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20); 
16097 +               page = mfn_to_pfn(mfn) << PAGE_SHIFT; 
16098 +               p  = (unsigned long *)__va(page);
16099 +               address &= 0x3fffffff;
16100 +               p += (address >> 21) * 2;
16101 +               printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", 
16102 +                      page, p[1], p[0]);
16103 +#ifndef CONFIG_HIGHPTE
16104 +               if (p[0] & 1) {
16105 +                       mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20); 
16106 +                       page = mfn_to_pfn(mfn) << PAGE_SHIFT; 
16107 +                       p  = (unsigned long *) __va(page);
16108 +                       address &= 0x001fffff;
16109 +                       p += (address >> 12) * 2;
16110 +                       printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
16111 +                              page, p[1], p[0]);
16112 +               }
16113 +#endif
16114 +       }
16115 +}
16116 +#else
16117 +static void dump_fault_path(unsigned long address)
16118 +{
16119 +       unsigned long page;
16120 +
16121 +       page = read_cr3();
16122 +       page = ((unsigned long *) __va(page))[address >> 22];
16123 +       printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
16124 +              machine_to_phys(page));
16125 +       /*
16126 +        * We must not directly access the pte in the highpte
16127 +        * case, the page table might be allocated in highmem.
16128 +        * And let's rather not kmap-atomic the pte, just in case
16129 +        * it's allocated already.
16130 +        */
16131 +#ifndef CONFIG_HIGHPTE
16132 +       if (page & 1) {
16133 +               page &= PAGE_MASK;
16134 +               address &= 0x003ff000;
16135 +               page = machine_to_phys(page);
16136 +               page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
16137 +               printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
16138 +                      machine_to_phys(page));
16139 +       }
16140 +#endif
16141 +}
16142 +#endif
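+
+/*
+ * Note that under Xen the page tables hold machine frame numbers, so
+ * the PAE walk above launders each level through mfn_to_pfn() before
+ * dereferencing it, while the non-PAE variant prints both the machine
+ * ("ma") and pseudo-physical ("pa") view of each entry.
+ */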
16143 +
16144 +
16145 +/*
16146 + * This routine handles page faults.  It determines the address,
16147 + * and the problem, and then passes it off to one of the appropriate
16148 + * routines.
16149 + *
16150 + * error_code:
16151 + *     bit 0 == 0 means no page found, 1 means protection fault
16152 + *     bit 1 == 0 means read, 1 means write
16153 + *     bit 2 == 0 means kernel, 1 means user-mode
16154 + */
16155 +fastcall void __kprobes do_page_fault(struct pt_regs *regs,
16156 +                                     unsigned long error_code)
16157 +{
16158 +       struct task_struct *tsk;
16159 +       struct mm_struct *mm;
16160 +       struct vm_area_struct * vma;
16161 +       unsigned long address;
16162 +       int write, si_code;
16163 +
16164 +       /* get the address */
16165 +        address = read_cr2();
16166 +
16167 +       /* Set the "privileged fault" bit to something sane. */
16168 +       error_code &= ~4;
16169 +       error_code |= (regs->xcs & 2) << 1;
16170 +       if (regs->eflags & X86_EFLAGS_VM)
16171 +               error_code |= 4;
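+       /*
+        * The hardware-supplied bit is unreliable here: under Xen the
+        * guest kernel runs in ring 1, not ring 0.  Bit 1 of the saved
+        * CS RPL distinguishes rings 0-1 from rings 2-3, so
+        * (regs->xcs & 2) << 1 recomputes error_code bit 2, and vm86
+        * tasks are always treated as user mode.
+        */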
16172 +
16173 +       if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
16174 +                                       SIGSEGV) == NOTIFY_STOP)
16175 +               return;
16176 +       /* It's safe to allow irq's after cr2 has been saved */
16177 +       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
16178 +               local_irq_enable();
16179 +
16180 +       tsk = current;
16181 +
16182 +       si_code = SEGV_MAPERR;
16183 +
16184 +       /*
16185 +        * We fault-in kernel-space virtual memory on-demand. The
16186 +        * 'reference' page table is init_mm.pgd.
16187 +        *
16188 +        * NOTE! We MUST NOT take any locks for this case. We may
16189 +        * be in an interrupt or a critical region, and should
16190 +        * only copy the information from the master page table,
16191 +        * nothing more.
16192 +        *
16193 +        * This verifies that the fault happens in kernel space
16194 +        * (error_code & 4) == 0, and that the fault was not a
16195 +        * protection error (error_code & 1) == 0.
16196 +        */
16197 +       if (unlikely(address >= TASK_SIZE)) { 
16198 +               if (!(error_code & 5))
16199 +                       goto vmalloc_fault;
16200 +               /* 
16201 +                * Don't take the mm semaphore here. If we fixup a prefetch
16202 +                * fault we could otherwise deadlock.
16203 +                */
16204 +               goto bad_area_nosemaphore;
16205 +       } 
16206 +
16207 +       mm = tsk->mm;
16208 +
16209 +       /*
16210 +        * If we're in an interrupt, have no user context or are running in an
16211 +        * atomic region then we must not take the fault..
16212 +        */
16213 +       if (in_atomic() || !mm)
16214 +               goto bad_area_nosemaphore;
16215 +
16216 +       /* When running in the kernel we expect faults to occur only to
16217 +        * addresses in user space.  All other faults represent errors in the
16218 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
16219 +        * erroneous fault occurring in a code path which already holds mmap_sem
16220 +        * we will deadlock attempting to validate the fault against the
16221 +        * address space.  Luckily the kernel only validly references user
16222 +        * space from well defined areas of code, which are listed in the
16223 +        * exceptions table.
16224 +        *
16225 +        * As the vast majority of faults will be valid we will only perform
16226 +        * the source reference check when there is a possibility of a deadlock.
16227 +        * Attempt to lock the address space, if we cannot we then validate the
16228 +        * source.  If this is invalid we can skip the address space check,
16229 +        * thus avoiding the deadlock.
16230 +        */
16231 +       if (!down_read_trylock(&mm->mmap_sem)) {
16232 +               if ((error_code & 4) == 0 &&
16233 +                   !search_exception_tables(regs->eip))
16234 +                       goto bad_area_nosemaphore;
16235 +               down_read(&mm->mmap_sem);
16236 +       }
16237 +
16238 +       vma = find_vma(mm, address);
16239 +       if (!vma)
16240 +               goto bad_area;
16241 +       if (vma->vm_start <= address)
16242 +               goto good_area;
16243 +       if (!(vma->vm_flags & VM_GROWSDOWN))
16244 +               goto bad_area;
16245 +       if (error_code & 4) {
16246 +               /*
16247 +                * accessing the stack below %esp is always a bug.
16248 +                * The "+ 32" is there due to some instructions (like
16249 +                * pusha) doing post-decrement on the stack and that
16250 +                * doesn't show up until later..
16251 +                */
16252 +               if (address + 32 < regs->esp)
16253 +                       goto bad_area;
16254 +       }
16255 +       if (expand_stack(vma, address))
16256 +               goto bad_area;
16257 +/*
16258 + * Ok, we have a good vm_area for this memory access, so
16259 + * we can handle it..
16260 + */
16261 +good_area:
16262 +       si_code = SEGV_ACCERR;
16263 +       write = 0;
16264 +       switch (error_code & 3) {
16265 +               default:        /* 3: write, present */
16266 +#ifdef TEST_VERIFY_AREA
16267 +                       if (regs->cs == GET_KERNEL_CS())
16268 +                               printk("WP fault at %08lx\n", regs->eip);
16269 +#endif
16270 +                       /* fall through */
16271 +               case 2:         /* write, not present */
16272 +                       if (!(vma->vm_flags & VM_WRITE))
16273 +                               goto bad_area;
16274 +                       write++;
16275 +                       break;
16276 +               case 1:         /* read, present */
16277 +                       goto bad_area;
16278 +               case 0:         /* read, not present */
16279 +                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
16280 +                               goto bad_area;
16281 +       }
16282 +
16283 + survive:
16284 +       /*
16285 +        * If for any reason at all we couldn't handle the fault,
16286 +        * make sure we exit gracefully rather than endlessly redo
16287 +        * the fault.
16288 +        */
16289 +       switch (handle_mm_fault(mm, vma, address, write)) {
16290 +               case VM_FAULT_MINOR:
16291 +                       tsk->min_flt++;
16292 +                       break;
16293 +               case VM_FAULT_MAJOR:
16294 +                       tsk->maj_flt++;
16295 +                       break;
16296 +               case VM_FAULT_SIGBUS:
16297 +                       goto do_sigbus;
16298 +               case VM_FAULT_OOM:
16299 +                       goto out_of_memory;
16300 +               default:
16301 +                       BUG();
16302 +       }
16303 +
16304 +       /*
16305 +        * Did it hit the DOS screen memory VA from vm86 mode?
16306 +        */
16307 +       if (regs->eflags & VM_MASK) {
16308 +               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
16309 +               if (bit < 32)
16310 +                       tsk->thread.screen_bitmap |= 1 << bit;
16311 +       }
16312 +       up_read(&mm->mmap_sem);
16313 +       return;
16314 +
16315 +/*
16316 + * Something tried to access memory that isn't in our memory map..
16317 + * Fix it, but check if it's kernel or user first..
16318 + */
16319 +bad_area:
16320 +       up_read(&mm->mmap_sem);
16321 +
16322 +bad_area_nosemaphore:
16323 +       /* User mode accesses just cause a SIGSEGV */
16324 +       if (error_code & 4) {
16325 +               /* 
16326 +                * Valid to do another page fault here because this one came 
16327 +                * from user space.
16328 +                */
16329 +               if (is_prefetch(regs, address, error_code))
16330 +                       return;
16331 +
16332 +               tsk->thread.cr2 = address;
16333 +               /* Kernel addresses are always protection faults */
16334 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
16335 +               tsk->thread.trap_no = 14;
16336 +               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
16337 +               return;
16338 +       }
16339 +
16340 +#ifdef CONFIG_X86_F00F_BUG
16341 +       /*
16342 +        * Pentium F0 0F C7 C8 bug workaround.
16343 +        */
16344 +       if (boot_cpu_data.f00f_bug) {
16345 +               unsigned long nr;
16346 +               
16347 +               nr = (address - idt_descr.address) >> 3;
16348 +
16349 +               if (nr == 6) {
16350 +                       do_invalid_op(regs, 0);
16351 +                       return;
16352 +               }
16353 +       }
16354 +#endif
16355 +
16356 +no_context:
16357 +       /* Are we prepared to handle this kernel fault?  */
16358 +       if (fixup_exception(regs))
16359 +               return;
16360 +
16361 +       /* 
16362 +        * Valid to do another page fault here, because if this fault
16363 +        * had been triggered by is_prefetch, fixup_exception would have
16364 +        * handled it.
16365 +        */
16366 +       if (is_prefetch(regs, address, error_code))
16367 +               return;
16368 +
16369 +/*
16370 + * Oops. The kernel tried to access some bad page. We'll have to
16371 + * terminate things with extreme prejudice.
16372 + */
16373 +
16374 +       bust_spinlocks(1);
16375 +
16376 +#ifdef CONFIG_X86_PAE
16377 +       if (error_code & 16) {
16378 +               pte_t *pte = lookup_address(address);
16379 +
16380 +               if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
16381 +                       printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
16382 +       }
16383 +#endif
16384 +       if (address < PAGE_SIZE)
16385 +               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
16386 +       else
16387 +               printk(KERN_ALERT "Unable to handle kernel paging request");
16388 +       printk(" at virtual address %08lx\n",address);
16389 +       printk(KERN_ALERT " printing eip:\n");
16390 +       printk("%08lx\n", regs->eip);
16391 +       dump_fault_path(address);
16392 +       tsk->thread.cr2 = address;
16393 +       tsk->thread.trap_no = 14;
16394 +       tsk->thread.error_code = error_code;
16395 +       die("Oops", regs, error_code);
16396 +       bust_spinlocks(0);
16397 +       do_exit(SIGKILL);
16398 +
16399 +/*
16400 + * We ran out of memory, or some other thing happened to us that made
16401 + * us unable to handle the page fault gracefully.
16402 + */
16403 +out_of_memory:
16404 +       up_read(&mm->mmap_sem);
16405 +       if (tsk->pid == 1) {
16406 +               yield();
16407 +               down_read(&mm->mmap_sem);
16408 +               goto survive;
16409 +       }
16410 +       printk("VM: killing process %s\n", tsk->comm);
16411 +       if (error_code & 4)
16412 +               do_exit(SIGKILL);
16413 +       goto no_context;
16414 +
16415 +do_sigbus:
16416 +       up_read(&mm->mmap_sem);
16417 +
16418 +       /* Kernel mode? Handle exceptions or die */
16419 +       if (!(error_code & 4))
16420 +               goto no_context;
16421 +
16422 +       /* User space => ok to do another page fault */
16423 +       if (is_prefetch(regs, address, error_code))
16424 +               return;
16425 +
16426 +       tsk->thread.cr2 = address;
16427 +       tsk->thread.error_code = error_code;
16428 +       tsk->thread.trap_no = 14;
16429 +       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
16430 +       return;
16431 +
16432 +vmalloc_fault:
16433 +       {
16434 +               /*
16435 +                * Synchronize this task's top level page-table
16436 +                * with the 'reference' page table.
16437 +                *
16438 +                * Do _not_ use "tsk" here. We might be inside
16439 +                * an interrupt in the middle of a task switch..
16440 +                */
16441 +               int index = pgd_index(address);
16442 +               unsigned long pgd_paddr;
16443 +               pgd_t *pgd, *pgd_k;
16444 +               pud_t *pud, *pud_k;
16445 +               pmd_t *pmd, *pmd_k;
16446 +               pte_t *pte_k;
16447 +
16448 +               pgd_paddr = read_cr3();
16449 +               pgd = index + (pgd_t *)__va(pgd_paddr);
16450 +               pgd_k = init_mm.pgd + index;
16451 +
16452 +               if (!pgd_present(*pgd_k))
16453 +                       goto no_context;
16454 +
16455 +               /*
16456 +                * set_pgd(pgd, *pgd_k) here would be useless on PAE and
16457 +                * redundant with the set_pmd() below on non-PAE; the same
16458 +                * goes for set_pud().
16459 +                */
16460 +
16461 +               pud = pud_offset(pgd, address);
16462 +               pud_k = pud_offset(pgd_k, address);
16463 +               if (!pud_present(*pud_k))
16464 +                       goto no_context;
16465 +               
16466 +               pmd = pmd_offset(pud, address);
16467 +               pmd_k = pmd_offset(pud_k, address);
16468 +               if (!pmd_present(*pmd_k))
16469 +                       goto no_context;
16470 +#ifndef CONFIG_XEN
16471 +               set_pmd(pmd, *pmd_k);
16472 +#else
16473 +               /*
16474 +                * When running on Xen we must launder *pmd_k through
16475 +                * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
16476 +                */
16477 +               set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
16478 +#endif
16479 +
16480 +               pte_k = pte_offset_kernel(pmd_k, address);
16481 +               if (!pte_present(*pte_k))
16482 +                       goto no_context;
16483 +               return;
16484 +       }
16485 +}
16486 diff -Nurp ref-linux-2.6.16.9/arch/i386/mm/highmem-xen.c tmp-linux-2.6-xen.patch/arch/i386/mm/highmem-xen.c
16487 --- ref-linux-2.6.16.9/arch/i386/mm/highmem-xen.c       1970-01-01 01:00:00.000000000 +0100
16488 +++ tmp-linux-2.6-xen.patch/arch/i386/mm/highmem-xen.c  2006-04-10 00:05:52.000000000 +0200
16489 @@ -0,0 +1,123 @@
16490 +#include <linux/highmem.h>
16491 +#include <linux/module.h>
16492 +
16493 +void *kmap(struct page *page)
16494 +{
16495 +       might_sleep();
16496 +       if (!PageHighMem(page))
16497 +               return page_address(page);
16498 +       return kmap_high(page);
16499 +}
16500 +
16501 +void kunmap(struct page *page)
16502 +{
16503 +       if (in_interrupt())
16504 +               BUG();
16505 +       if (!PageHighMem(page))
16506 +               return;
16507 +       kunmap_high(page);
16508 +}
16509 +
16510 +/*
16511 + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
16512 + * no global lock is needed and because the kmap code must perform a global TLB
16513 + * invalidation when the kmap pool wraps.
16514 + *
16515 + * However, when holding an atomic kmap it is not legal to sleep, so atomic
16516 + * kmaps are appropriate for short, tight code paths only.
16517 + */
16518 +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
16519 +{
16520 +       enum fixed_addresses idx;
16521 +       unsigned long vaddr;
16522 +
16523 +       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
16524 +       inc_preempt_count();
16525 +       if (!PageHighMem(page))
16526 +               return page_address(page);
16527 +
16528 +       idx = type + KM_TYPE_NR*smp_processor_id();
16529 +       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
16530 +#ifdef CONFIG_DEBUG_HIGHMEM
16531 +       if (!pte_none(*(kmap_pte-idx)))
16532 +               BUG();
16533 +#endif
16534 +       set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
16535 +
16536 +       return (void*) vaddr;
16537 +}
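+
+/*
+ * Unlike native i386, which does set_pte() followed by
+ * __flush_tlb_one() here, the Xen build uses set_pte_at_sync(), which
+ * is expected to combine the pte update with a local TLB flush of
+ * vaddr so the new mapping becomes visible immediately.
+ */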
16538 +
16539 +void *kmap_atomic(struct page *page, enum km_type type)
16540 +{
16541 +       return __kmap_atomic(page, type, kmap_prot);
16542 +}
16543 +
16544 +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
16545 +void *kmap_atomic_pte(struct page *page, enum km_type type)
16546 +{
16547 +       return __kmap_atomic(page, type, PAGE_KERNEL_RO);
16548 +}
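+
+/*
+ * The read-only protection above is a Xen requirement rather than an
+ * optimisation: in the default mode a page currently in use as a page
+ * table may not have any writable virtual mapping, so pagetable pages
+ * reached through the atomic-kmap window must be mapped PAGE_KERNEL_RO.
+ */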
16549 +
16550 +void kunmap_atomic(void *kvaddr, enum km_type type)
16551 +{
16552 +#ifdef CONFIG_DEBUG_HIGHMEM
16553 +       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
16554 +       enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
16555 +
16556 +       if (vaddr < FIXADDR_START) { // FIXME
16557 +               dec_preempt_count();
16558 +               preempt_check_resched();
16559 +               return;
16560 +       }
16561 +
16562 +       if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
16563 +               BUG();
16564 +
16565 +       /*
16566 +        * Force other mappings to oops if they try to access
16567 +        * this pte without first remapping it.
16568 +        */
16569 +       pte_clear(&init_mm, vaddr, kmap_pte-idx);
16570 +       __flush_tlb_one(vaddr);
16571 +#endif
16572 +
16573 +       dec_preempt_count();
16574 +       preempt_check_resched();
16575 +}
16576 +
16577 +/* This is the same as kmap_atomic() but can map memory that doesn't
16578 + * have a struct page associated with it.
16579 + */
16580 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
16581 +{
16582 +       enum fixed_addresses idx;
16583 +       unsigned long vaddr;
16584 +
16585 +       inc_preempt_count();
16586 +
16587 +       idx = type + KM_TYPE_NR*smp_processor_id();
16588 +       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
16589 +       set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
16590 +       __flush_tlb_one(vaddr);
16591 +
16592 +       return (void*) vaddr;
16593 +}
16594 +
16595 +struct page *kmap_atomic_to_page(void *ptr)
16596 +{
16597 +       unsigned long idx, vaddr = (unsigned long)ptr;
16598 +       pte_t *pte;
16599 +
16600 +       if (vaddr < FIXADDR_START)
16601 +               return virt_to_page(ptr);
16602 +
16603 +       idx = virt_to_fix(vaddr);
16604 +       pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
16605 +       return pte_page(*pte);
16606 +}
16607 +
16608 +EXPORT_SYMBOL(kmap);
16609 +EXPORT_SYMBOL(kunmap);
16610 +EXPORT_SYMBOL(kmap_atomic);
16611 +EXPORT_SYMBOL(kunmap_atomic);
16612 +EXPORT_SYMBOL(kmap_atomic_to_page);
16613 diff -Nurp ref-linux-2.6.16.9/arch/i386/mm/hypervisor.c tmp-linux-2.6-xen.patch/arch/i386/mm/hypervisor.c
16614 --- ref-linux-2.6.16.9/arch/i386/mm/hypervisor.c        1970-01-01 01:00:00.000000000 +0100
16615 +++ tmp-linux-2.6-xen.patch/arch/i386/mm/hypervisor.c   2006-04-10 00:05:52.000000000 +0200
16616 @@ -0,0 +1,424 @@
16617 +/******************************************************************************
16618 + * mm/hypervisor.c
16619 + * 
16620 + * Update page tables via the hypervisor.
16621 + * 
16622 + * Copyright (c) 2002-2004, K A Fraser
16623 + * 
16624 + * This program is free software; you can redistribute it and/or
16625 + * modify it under the terms of the GNU General Public License version 2
16626 + * as published by the Free Software Foundation; or, when distributed
16627 + * separately from the Linux kernel or incorporated into other
16628 + * software packages, subject to the following license:
16629 + * 
16630 + * Permission is hereby granted, free of charge, to any person obtaining a copy
16631 + * of this source file (the "Software"), to deal in the Software without
16632 + * restriction, including without limitation the rights to use, copy, modify,
16633 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
16634 + * and to permit persons to whom the Software is furnished to do so, subject to
16635 + * the following conditions:
16636 + * 
16637 + * The above copyright notice and this permission notice shall be included in
16638 + * all copies or substantial portions of the Software.
16639 + * 
16640 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16641 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16642 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16643 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16644 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
16645 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
16646 + * IN THE SOFTWARE.
16647 + */
16648 +
16649 +#include <linux/config.h>
16650 +#include <linux/sched.h>
16651 +#include <linux/mm.h>
16652 +#include <linux/vmalloc.h>
16653 +#include <asm/page.h>
16654 +#include <asm/pgtable.h>
16655 +#include <asm/hypervisor.h>
16656 +#include <xen/balloon.h>
16657 +#include <xen/features.h>
16658 +#include <xen/interface/memory.h>
16659 +#include <linux/module.h>
16660 +#include <linux/percpu.h>
16661 +#include <asm/tlbflush.h>
16662 +
16663 +#ifdef CONFIG_X86_64
16664 +#define pmd_val_ma(v) (v).pmd
16665 +#else
16666 +#ifdef CONFIG_X86_PAE
16667 +# define pmd_val_ma(v) ((v).pmd)
16668 +# define pud_val_ma(v) ((v).pgd.pgd)
16669 +#else
16670 +# define pmd_val_ma(v) ((v).pud.pgd.pgd)
16671 +#endif
16672 +#endif
16673 +
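+/*
+ * Each helper below wraps a single-element HYPERVISOR_mmu_update or
+ * HYPERVISOR_mmuext_op hypercall.  An mmu_update_t carries the machine
+ * address of the pagetable entry to modify (with the update type
+ * encoded in its low bits) and the new entry value; DOMID_SELF applies
+ * the operation to the calling domain.  A failing hypercall means the
+ * hypervisor refused to validate the new entry, hence the BUG_ON()s.
+ */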
16674 +void xen_l1_entry_update(pte_t *ptr, pte_t val)
16675 +{
16676 +       mmu_update_t u;
16677 +       u.ptr = virt_to_machine(ptr);
16678 +       u.val = pte_val_ma(val);
16679 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16680 +}
16681 +
16682 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
16683 +{
16684 +       mmu_update_t u;
16685 +       u.ptr = virt_to_machine(ptr);
16686 +       u.val = pmd_val_ma(val);
16687 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16688 +}
16689 +
16690 +#ifdef CONFIG_X86_PAE
16691 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
16692 +{
16693 +       mmu_update_t u;
16694 +       u.ptr = virt_to_machine(ptr);
16695 +       u.val = pud_val_ma(val);
16696 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16697 +}
16698 +#endif
16699 +
16700 +#ifdef CONFIG_X86_64
16701 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
16702 +{
16703 +       mmu_update_t u;
16704 +       u.ptr = virt_to_machine(ptr);
16705 +       u.val = val.pud;
16706 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16707 +}
16708 +
16709 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
16710 +{
16711 +       mmu_update_t u;
16712 +       u.ptr = virt_to_machine(ptr);
16713 +       u.val = val.pgd;
16714 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16715 +}
16716 +#endif /* CONFIG_X86_64 */
16717 +
16718 +void xen_machphys_update(unsigned long mfn, unsigned long pfn)
16719 +{
16720 +       mmu_update_t u;
16721 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
16722 +               BUG_ON(pfn != mfn);
16723 +               return;
16724 +       }
16725 +       u.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
16726 +       u.val = pfn;
16727 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16728 +}
16729 +
16730 +void xen_pt_switch(unsigned long ptr)
16731 +{
16732 +       struct mmuext_op op;
16733 +       op.cmd = MMUEXT_NEW_BASEPTR;
16734 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16735 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16736 +}
16737 +
16738 +void xen_new_user_pt(unsigned long ptr)
16739 +{
16740 +       struct mmuext_op op;
16741 +       op.cmd = MMUEXT_NEW_USER_BASEPTR;
16742 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16743 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16744 +}
16745 +
16746 +void xen_tlb_flush(void)
16747 +{
16748 +       struct mmuext_op op;
16749 +       op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
16750 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16751 +}
16752 +
16753 +void xen_invlpg(unsigned long ptr)
16754 +{
16755 +       struct mmuext_op op;
16756 +       op.cmd = MMUEXT_INVLPG_LOCAL;
16757 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16758 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16759 +}
16760 +
16761 +#ifdef CONFIG_SMP
16762 +
16763 +void xen_tlb_flush_all(void)
16764 +{
16765 +       struct mmuext_op op;
16766 +       op.cmd = MMUEXT_TLB_FLUSH_ALL;
16767 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16768 +}
16769 +
16770 +void xen_tlb_flush_mask(cpumask_t *mask)
16771 +{
16772 +       struct mmuext_op op;
16773 +       if ( cpus_empty(*mask) )
16774 +               return;
16775 +       op.cmd = MMUEXT_TLB_FLUSH_MULTI;
16776 +       op.arg2.vcpumask = mask->bits;
16777 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16778 +}
16779 +
16780 +void xen_invlpg_all(unsigned long ptr)
16781 +{
16782 +       struct mmuext_op op;
16783 +       op.cmd = MMUEXT_INVLPG_ALL;
16784 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16785 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16786 +}
16787 +
16788 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
16789 +{
16790 +       struct mmuext_op op;
16791 +       if ( cpus_empty(*mask) )
16792 +               return;
16793 +       op.cmd = MMUEXT_INVLPG_MULTI;
16794 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16795 +       op.arg2.vcpumask    = mask->bits;
16796 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16797 +}
16798 +
16799 +#endif /* CONFIG_SMP */
16800 +
16801 +void xen_pgd_pin(unsigned long ptr)
16802 +{
16803 +       struct mmuext_op op;
16804 +#ifdef CONFIG_X86_64
16805 +       op.cmd = MMUEXT_PIN_L4_TABLE;
16806 +#elif defined(CONFIG_X86_PAE)
16807 +       op.cmd = MMUEXT_PIN_L3_TABLE;
16808 +#else
16809 +       op.cmd = MMUEXT_PIN_L2_TABLE;
16810 +#endif
16811 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16812 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16813 +}
16814 +
16815 +void xen_pgd_unpin(unsigned long ptr)
16816 +{
16817 +       struct mmuext_op op;
16818 +       op.cmd = MMUEXT_UNPIN_TABLE;
16819 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16820 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16821 +}
16822 +
16823 +void xen_set_ldt(unsigned long ptr, unsigned long len)
16824 +{
16825 +       struct mmuext_op op;
16826 +       op.cmd = MMUEXT_SET_LDT;
16827 +       op.arg1.linear_addr = ptr;
16828 +       op.arg2.nr_ents     = len;
16829 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16830 +}
16831 +
16832 +/*
16833 + * Bitmap is indexed by page number. If bit is set, the page is part of a
16834 + * xen_create_contiguous_region() area of memory.
16835 + */
16836 +unsigned long *contiguous_bitmap;
16837 +
16838 +static void contiguous_bitmap_set(
16839 +       unsigned long first_page, unsigned long nr_pages)
16840 +{
16841 +       unsigned long start_off, end_off, curr_idx, end_idx;
16842 +
16843 +       curr_idx  = first_page / BITS_PER_LONG;
16844 +       start_off = first_page & (BITS_PER_LONG-1);
16845 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
16846 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
16847 +
16848 +       if (curr_idx == end_idx) {
16849 +               contiguous_bitmap[curr_idx] |=
16850 +                       ((1UL<<end_off)-1) & -(1UL<<start_off);
16851 +       } else {
16852 +               contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
16853 +               while ( ++curr_idx < end_idx )
16854 +                       contiguous_bitmap[curr_idx] = ~0UL;
16855 +               contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
16856 +       }
16857 +}
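+
+/*
+ * Example of the single-word mask arithmetic above: for first_page = 3
+ * and nr_pages = 2 on a 32-bit long, start_off = 3 and end_off = 5, so
+ * ((1UL<<5)-1) & -(1UL<<3) == 0x1f & 0xfffffff8 == 0x18, i.e. exactly
+ * bits 3 and 4 get set.
+ */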
16858 +
16859 +static void contiguous_bitmap_clear(
16860 +       unsigned long first_page, unsigned long nr_pages)
16861 +{
16862 +       unsigned long start_off, end_off, curr_idx, end_idx;
16863 +
16864 +       curr_idx  = first_page / BITS_PER_LONG;
16865 +       start_off = first_page & (BITS_PER_LONG-1);
16866 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
16867 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
16868 +
16869 +       if (curr_idx == end_idx) {
16870 +               contiguous_bitmap[curr_idx] &=
16871 +                       -(1UL<<end_off) | ((1UL<<start_off)-1);
16872 +       } else {
16873 +               contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
16874 +               while ( ++curr_idx != end_idx )
16875 +                       contiguous_bitmap[curr_idx] = 0;
16876 +               contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
16877 +       }
16878 +}
16879 +
16880 +/* Ensure multi-page extents are contiguous in machine memory. */
16881 +int xen_create_contiguous_region(
16882 +       unsigned long vstart, unsigned int order, unsigned int address_bits)
16883 +{
16884 +       pgd_t         *pgd; 
16885 +       pud_t         *pud; 
16886 +       pmd_t         *pmd;
16887 +       pte_t         *pte;
16888 +       unsigned long  frame, i, flags;
16889 +       struct xen_memory_reservation reservation = {
16890 +               .extent_start = &frame,
16891 +               .nr_extents   = 1,
16892 +               .extent_order = 0,
16893 +               .domid        = DOMID_SELF
16894 +       };
16895 +
16896 +       /*
16897 +        * Currently an auto-translated guest will not perform I/O, nor will
16898 +        * it require PAE page directories below 4GB. Therefore any calls to
16899 +        * this function are redundant and can be ignored.
16900 +        */
16901 +       if (xen_feature(XENFEAT_auto_translated_physmap))
16902 +               return 0;
16903 +
16904 +       scrub_pages(vstart, 1 << order);
16905 +
16906 +       balloon_lock(flags);
16907 +
16908 +       /* 1. Zap current PTEs, giving away the underlying pages. */
16909 +       for (i = 0; i < (1<<order); i++) {
16910 +               pgd = pgd_offset_k(vstart + (i*PAGE_SIZE));
16911 +               pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
16912 +               pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
16913 +               pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
16914 +               frame = pte_mfn(*pte);
16915 +               BUG_ON(HYPERVISOR_update_va_mapping(
16916 +                       vstart + (i*PAGE_SIZE), __pte_ma(0), 0));
16917 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
16918 +                       INVALID_P2M_ENTRY);
16919 +               BUG_ON(HYPERVISOR_memory_op(
16920 +                       XENMEM_decrease_reservation, &reservation) != 1);
16921 +       }
16922 +
16923 +       /* 2. Get a new contiguous memory extent. */
16924 +       reservation.extent_order = order;
16925 +       reservation.address_bits = address_bits;
16926 +       frame = __pa(vstart) >> PAGE_SHIFT;
16927 +       if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
16928 +                                &reservation) != 1)
16929 +               goto fail;
16930 +
16931 +       /* 3. Map the new extent in place of old pages. */
16932 +       for (i = 0; i < (1<<order); i++) {
16933 +               BUG_ON(HYPERVISOR_update_va_mapping(
16934 +                       vstart + (i*PAGE_SIZE),
16935 +                       pfn_pte_ma(frame+i, PAGE_KERNEL), 0));
16936 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame+i);
16937 +       }
16938 +
16939 +       flush_tlb_all();
16940 +
16941 +       contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
16942 +
16943 +       balloon_unlock(flags);
16944 +
16945 +       return 0;
16946 +
16947 + fail:
16948 +       reservation.extent_order = 0;
16949 +       reservation.address_bits = 0;
16950 +
16951 +       for (i = 0; i < (1<<order); i++) {
16952 +               frame = (__pa(vstart) >> PAGE_SHIFT) + i;
16953 +               BUG_ON(HYPERVISOR_memory_op(
16954 +                       XENMEM_populate_physmap, &reservation) != 1);
16955 +               BUG_ON(HYPERVISOR_update_va_mapping(
16956 +                       vstart + (i*PAGE_SIZE),
16957 +                       pfn_pte_ma(frame, PAGE_KERNEL), 0));
16958 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
16959 +       }
16960 +
16961 +       flush_tlb_all();
16962 +
16963 +       balloon_unlock(flags);
16964 +
16965 +       return -ENOMEM;
16966 +}
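+
+/*
+ * Sketch of a typical use (illustrative only): a driver needing a
+ * machine-contiguous DMA buffer below 4GB could do
+ *
+ *     vstart = __get_free_pages(GFP_KERNEL, order);
+ *     if (vstart && xen_create_contiguous_region(vstart, order, 32))
+ *             ...free the pages and fall back...
+ *
+ * where address_bits == 32 asks for frames addressable with 32 bits.
+ * Note that the failure path above restores the original reservation
+ * one page at a time, so the virtual range is never left unmapped.
+ */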
16967 +
16968 +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
16969 +{
16970 +       pgd_t         *pgd; 
16971 +       pud_t         *pud; 
16972 +       pmd_t         *pmd;
16973 +       pte_t         *pte;
16974 +       unsigned long  frame, i, flags;
16975 +       struct xen_memory_reservation reservation = {
16976 +               .extent_start = &frame,
16977 +               .nr_extents   = 1,
16978 +               .extent_order = 0,
16979 +               .domid        = DOMID_SELF
16980 +       };
16981 +
16982 +       if (xen_feature(XENFEAT_auto_translated_physmap))
16983 +               return;
16984 +
16985 +       scrub_pages(vstart, 1 << order);
16986 +
16987 +       balloon_lock(flags);
16988 +
16989 +       contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
16990 +
16991 +       /* 1. Zap current PTEs, giving away the underlying pages. */
16992 +       for (i = 0; i < (1<<order); i++) {
16993 +               pgd = pgd_offset_k(vstart + (i*PAGE_SIZE));
16994 +               pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
16995 +               pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
16996 +               pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
16997 +               frame = pte_mfn(*pte);
16998 +               BUG_ON(HYPERVISOR_update_va_mapping(
16999 +                       vstart + (i*PAGE_SIZE), __pte_ma(0), 0));
17000 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
17001 +                       INVALID_P2M_ENTRY);
17002 +               BUG_ON(HYPERVISOR_memory_op(
17003 +                       XENMEM_decrease_reservation, &reservation) != 1);
17004 +       }
17005 +
17006 +       /* 2. Map new pages in place of old pages. */
17007 +       for (i = 0; i < (1<<order); i++) {
17008 +               frame = (__pa(vstart) >> PAGE_SHIFT) + i;
17009 +               BUG_ON(HYPERVISOR_memory_op(
17010 +                       XENMEM_populate_physmap, &reservation) != 1);
17011 +               BUG_ON(HYPERVISOR_update_va_mapping(
17012 +                       vstart + (i*PAGE_SIZE),
17013 +                       pfn_pte_ma(frame, PAGE_KERNEL), 0));
17014 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
17015 +       }
17016 +
17017 +       flush_tlb_all();
17018 +
17019 +       balloon_unlock(flags);
17020 +}
17021 +
17022 +#ifdef __i386__
17023 +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
17024 +{
17025 +       __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
17026 +       maddr_t mach_lp = arbitrary_virt_to_machine(lp);
17027 +       return HYPERVISOR_update_descriptor(
17028 +               mach_lp, (u64)entry_a | ((u64)entry_b<<32));
17029 +}
17030 +#endif
17031 +
17032 +/*
17033 + * Local variables:
17034 + *  c-file-style: "linux"
17035 + *  indent-tabs-mode: t
17036 + *  c-indent-level: 8
17037 + *  c-basic-offset: 8
17038 + *  tab-width: 8
17039 + * End:
17040 + */
17041 diff -Nurp ref-linux-2.6.16.9/arch/i386/mm/init-xen.c tmp-linux-2.6-xen.patch/arch/i386/mm/init-xen.c
17042 --- ref-linux-2.6.16.9/arch/i386/mm/init-xen.c  1970-01-01 01:00:00.000000000 +0100
17043 +++ tmp-linux-2.6-xen.patch/arch/i386/mm/init-xen.c     2006-04-10 00:05:52.000000000 +0200
17044 @@ -0,0 +1,854 @@
17045 +/*
17046 + *  linux/arch/i386/mm/init.c
17047 + *
17048 + *  Copyright (C) 1995  Linus Torvalds
17049 + *
17050 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
17051 + */
17052 +
17053 +#include <linux/config.h>
17054 +#include <linux/module.h>
17055 +#include <linux/signal.h>
17056 +#include <linux/sched.h>
17057 +#include <linux/kernel.h>
17058 +#include <linux/errno.h>
17059 +#include <linux/string.h>
17060 +#include <linux/types.h>
17061 +#include <linux/ptrace.h>
17062 +#include <linux/mman.h>
17063 +#include <linux/mm.h>
17064 +#include <linux/hugetlb.h>
17065 +#include <linux/swap.h>
17066 +#include <linux/smp.h>
17067 +#include <linux/init.h>
17068 +#include <linux/highmem.h>
17069 +#include <linux/pagemap.h>
17070 +#include <linux/bootmem.h>
17071 +#include <linux/slab.h>
17072 +#include <linux/proc_fs.h>
17073 +#include <linux/efi.h>
17074 +#include <linux/memory_hotplug.h>
17075 +#include <linux/initrd.h>
17076 +#include <linux/dma-mapping.h>
17077 +#include <linux/scatterlist.h>
17078 +
17079 +#include <asm/processor.h>
17080 +#include <asm/system.h>
17081 +#include <asm/uaccess.h>
17082 +#include <asm/pgtable.h>
17083 +#include <asm/dma.h>
17084 +#include <asm/fixmap.h>
17085 +#include <asm/e820.h>
17086 +#include <asm/apic.h>
17087 +#include <asm/tlb.h>
17088 +#include <asm/tlbflush.h>
17089 +#include <asm/sections.h>
17090 +#include <asm/hypervisor.h>
17091 +#include <asm/swiotlb.h>
17092 +
17093 +extern unsigned long *contiguous_bitmap;
17094 +
17095 +unsigned int __VMALLOC_RESERVE = 128 << 20;
17096 +
17097 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
17098 +unsigned long highstart_pfn, highend_pfn;
17099 +
17100 +static int noinline do_test_wp_bit(void);
17101 +
17102 +/*
17103 + * Creates a middle page table and puts a pointer to it in the
17104 + * given global directory entry. This only returns the gd entry
17105 + * given global directory entry.  In non-PAE compilation mode this
17106 + * effectively returns the pgd entry itself, since the middle layer is folded.
17107 +static pmd_t * __init one_md_table_init(pgd_t *pgd)
17108 +{
17109 +       pud_t *pud;
17110 +       pmd_t *pmd_table;
17111 +               
17112 +#ifdef CONFIG_X86_PAE
17113 +       pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
17114 +       make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
17115 +       set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
17116 +       pud = pud_offset(pgd, 0);
17117 +       if (pmd_table != pmd_offset(pud, 0)) 
17118 +               BUG();
17119 +#else
17120 +       pud = pud_offset(pgd, 0);
17121 +       pmd_table = pmd_offset(pud, 0);
17122 +#endif
17123 +
17124 +       return pmd_table;
17125 +}
17126 +
17127 +/*
17128 + * Create a page table and place a pointer to it in a middle page
17129 + * directory entry.
17130 + */
17131 +static pte_t * __init one_page_table_init(pmd_t *pmd)
17132 +{
17133 +       if (pmd_none(*pmd)) {
17134 +               pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
17135 +               make_lowmem_page_readonly(page_table,
17136 +                                         XENFEAT_writable_page_tables);
17137 +               set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
17138 +               if (page_table != pte_offset_kernel(pmd, 0))
17139 +                       BUG();  
17140 +
17141 +               return page_table;
17142 +       }
17143 +       
17144 +       return pte_offset_kernel(pmd, 0);
17145 +}
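+
+/*
+ * Both helpers above make the freshly allocated table page read-only
+ * before hooking it in: unless the writable-pagetables feature named
+ * by the XENFEAT_writable_page_tables argument is available, Xen will
+ * not accept a page as a page table while a writable mapping of it
+ * exists.
+ */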
17146 +
17147 +/*
17148 + * This function initializes a certain range of kernel virtual memory 
17149 + * This function initializes a given range of kernel virtual memory
17150 + * with new bootmem page tables wherever page tables are missing in
17151 + * that range.
17152 +
17153 +/*
17154 + * NOTE: The pagetables are allocated contiguously in physical memory,
17155 + * so we can cache the location of the first one and move around
17156 + * without re-checking the pgd every time.
17157 + */
17158 +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
17159 +{
17160 +       pgd_t *pgd;
17161 +       pud_t *pud;
17162 +       pmd_t *pmd;
17163 +       int pgd_idx, pmd_idx;
17164 +       unsigned long vaddr;
17165 +
17166 +       vaddr = start;
17167 +       pgd_idx = pgd_index(vaddr);
17168 +       pmd_idx = pmd_index(vaddr);
17169 +       pgd = pgd_base + pgd_idx;
17170 +
17171 +       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
17172 +               if (pgd_none(*pgd)) 
17173 +                       one_md_table_init(pgd);
17174 +               pud = pud_offset(pgd, vaddr);
17175 +               pmd = pmd_offset(pud, vaddr);
17176 +               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
17177 +                       if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd)) 
17178 +                               one_page_table_init(pmd);
17179 +
17180 +                       vaddr += PMD_SIZE;
17181 +               }
17182 +               pmd_idx = 0;
17183 +       }
17184 +}
17185 +
17186 +static inline int is_kernel_text(unsigned long addr)
17187 +{
17188 +       if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
17189 +               return 1;
17190 +       return 0;
17191 +}
17192 +
17193 +/*
17194 + * This maps the physical memory to kernel virtual address space, a total 
17195 + * of max_low_pfn pages, by creating page tables starting from address 
17196 + * PAGE_OFFSET.
17197 + */
17198 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
17199 +{
17200 +       unsigned long pfn;
17201 +       pgd_t *pgd;
17202 +       pmd_t *pmd;
17203 +       pte_t *pte;
17204 +       int pgd_idx, pmd_idx, pte_ofs;
17205 +
17206 +       unsigned long max_ram_pfn = xen_start_info->nr_pages;
17207 +       if (max_ram_pfn > max_low_pfn)
17208 +               max_ram_pfn = max_low_pfn;
17209 +
17210 +       pgd_idx = pgd_index(PAGE_OFFSET);
17211 +       pgd = pgd_base + pgd_idx;
17212 +       pfn = 0;
17213 +       pmd_idx = pmd_index(PAGE_OFFSET);
17214 +       pte_ofs = pte_index(PAGE_OFFSET);
17215 +
17216 +       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
17217 +#ifdef CONFIG_XEN
17218 +               /*
17219 +                * Native Linux doesn't have PAE paging enabled yet at
17220 +                * this point.  When running as a Xen domain we are in
17221 +                * PAE mode already, so we can't simply hook an empty
17222 +                * pmd: that would kill the mappings we are currently
17223 +                * using.
17224 +                */
17225 +               pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
17226 +#else
17227 +               pmd = one_md_table_init(pgd);
17228 +#endif
17229 +               if (pfn >= max_low_pfn)
17230 +                       continue;
17231 +               pmd += pmd_idx;
17232 +               for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
17233 +                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
17234 +                       if (address >= HYPERVISOR_VIRT_START)
17235 +                               continue;
17236 +
17237 +                       /* Map with big pages if possible, otherwise create normal page tables. */
17238 +                       if (cpu_has_pse) {
17239 +                               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
17240 +
17241 +                               if (is_kernel_text(address) || is_kernel_text(address2))
17242 +                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
17243 +                               else
17244 +                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
17245 +                               pfn += PTRS_PER_PTE;
17246 +                       } else {
17247 +                               pte = one_page_table_init(pmd);
17248 +
17249 +                               pte += pte_ofs;
17250 +                               for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
17251 +                                               /* XEN: Only map initial RAM allocation. */
17252 +                                               if ((pfn >= max_ram_pfn) || pte_present(*pte))
17253 +                                                       continue;
17254 +                                               if (is_kernel_text(address))
17255 +                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
17256 +                                               else
17257 +                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
17258 +                               }
17259 +                               pte_ofs = 0;
17260 +                       }
17261 +               }
17262 +               pmd_idx = 0;
17263 +       }
17264 +}
17265 +
17266 +#ifndef CONFIG_XEN
17267 +
17268 +static inline int page_kills_ppro(unsigned long pagenr)
17269 +{
17270 +       if (pagenr >= 0x70000 && pagenr <= 0x7003F)
17271 +               return 1;
17272 +       return 0;
17273 +}
17274 +
17275 +extern int is_available_memory(efi_memory_desc_t *);
17276 +
17277 +int page_is_ram(unsigned long pagenr)
17278 +{
17279 +       int i;
17280 +       unsigned long addr, end;
17281 +
17282 +       if (efi_enabled) {
17283 +               efi_memory_desc_t *md;
17284 +               void *p;
17285 +
17286 +               for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
17287 +                       md = p;
17288 +                       if (!is_available_memory(md))
17289 +                               continue;
17290 +                       addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
17291 +                       end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
17292 +
17293 +                       if ((pagenr >= addr) && (pagenr < end))
17294 +                               return 1;
17295 +               }
17296 +               return 0;
17297 +       }
17298 +
17299 +       for (i = 0; i < e820.nr_map; i++) {
17300 +
17301 +               if (e820.map[i].type != E820_RAM)       /* not usable memory */
17302 +                       continue;
17303 +               /*
17304 +                *      !!!FIXME!!! Some BIOSen report areas as RAM that
17305 +                *      are not. Notably the 640->1Mb area. We need a sanity
17306 +                *      check here.
17307 +                */
17308 +               addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
17309 +               end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
17310 +               if  ((pagenr >= addr) && (pagenr < end))
17311 +                       return 1;
17312 +       }
17313 +       return 0;
17314 +}
17315 +
17316 +#else /* CONFIG_XEN */
17317 +
17318 +#define page_kills_ppro(p)     0
17319 +#define page_is_ram(p)         1
17320 +
17321 +#endif
17322 +
17323 +#ifdef CONFIG_HIGHMEM
17324 +pte_t *kmap_pte;
17325 +pgprot_t kmap_prot;
17326 +
17327 +#define kmap_get_fixmap_pte(vaddr)                                     \
17328 +       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
17329 +
17330 +static void __init kmap_init(void)
17331 +{
17332 +       unsigned long kmap_vstart;
17333 +
17334 +       /* cache the first kmap pte */
17335 +       kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
17336 +       kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
17337 +
17338 +       kmap_prot = PAGE_KERNEL;
17339 +}
17340 +
17341 +static void __init permanent_kmaps_init(pgd_t *pgd_base)
17342 +{
17343 +       pgd_t *pgd;
17344 +       pud_t *pud;
17345 +       pmd_t *pmd;
17346 +       pte_t *pte;
17347 +       unsigned long vaddr;
17348 +
17349 +       vaddr = PKMAP_BASE;
17350 +       page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
17351 +
17352 +       pgd = swapper_pg_dir + pgd_index(vaddr);
17353 +       pud = pud_offset(pgd, vaddr);
17354 +       pmd = pmd_offset(pud, vaddr);
17355 +       pte = pte_offset_kernel(pmd, vaddr);
17356 +       pkmap_page_table = pte; 
17357 +}
17358 +
17359 +static void __meminit free_new_highpage(struct page *page, int pfn)
17360 +{
17361 +       set_page_count(page, 1);
17362 +       if (pfn < xen_start_info->nr_pages)
17363 +               __free_page(page);
17364 +       totalhigh_pages++;
17365 +}
17366 +
17367 +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
17368 +{
17369 +       if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
17370 +               ClearPageReserved(page);
17371 +               free_new_highpage(page, pfn);
17372 +       } else
17373 +               SetPageReserved(page);
17374 +}
17375 +
17376 +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
17377 +{
17378 +       free_new_highpage(page, pfn);
17379 +       totalram_pages++;
17380 +#ifdef CONFIG_FLATMEM
17381 +       max_mapnr = max(pfn, max_mapnr);
17382 +#endif
17383 +       num_physpages++;
17384 +       return 0;
17385 +}
17386 +
17387 +/*
17388 + * Not currently handling the NUMA case.
17389 + * Assumes a single node, and that any memory
17390 + * added dynamically and onlined here is in
17391 + * HIGHMEM.
17392 + */
17393 +void online_page(struct page *page)
17394 +{
17395 +       ClearPageReserved(page);
17396 +       add_one_highpage_hotplug(page, page_to_pfn(page));
17397 +}
17398 +
17399 +
17400 +#ifdef CONFIG_NUMA
17401 +extern void set_highmem_pages_init(int);
17402 +#else
17403 +static void __init set_highmem_pages_init(int bad_ppro)
17404 +{
17405 +       int pfn;
17406 +       for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
17407 +               add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
17408 +       totalram_pages += totalhigh_pages;
17409 +}
17410 +#endif /* CONFIG_NUMA */
17411 +
17412 +#else
17413 +#define kmap_init() do { } while (0)
17414 +#define permanent_kmaps_init(pgd_base) do { } while (0)
17415 +#define set_highmem_pages_init(bad_ppro) do { } while (0)
17416 +#endif /* CONFIG_HIGHMEM */
17417 +
17418 +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
17419 +EXPORT_SYMBOL(__PAGE_KERNEL);
17420 +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
17421 +
17422 +#ifdef CONFIG_NUMA
17423 +extern void __init remap_numa_kva(void);
17424 +#else
17425 +#define remap_numa_kva() do {} while (0)
17426 +#endif
17427 +
17428 +pgd_t *swapper_pg_dir;
17429 +
17430 +static void __init pagetable_init (void)
17431 +{
17432 +       unsigned long vaddr;
17433 +       pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
17434 +
17435 +       swapper_pg_dir = pgd_base;
17436 +       init_mm.pgd    = pgd_base;
17437 +
17438 +       /* Enable PSE if available */
17439 +       if (cpu_has_pse) {
17440 +               set_in_cr4(X86_CR4_PSE);
17441 +       }
17442 +
17443 +       /* Enable PGE if available */
17444 +       if (cpu_has_pge) {
17445 +               set_in_cr4(X86_CR4_PGE);
17446 +               __PAGE_KERNEL |= _PAGE_GLOBAL;
17447 +               __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
17448 +       }
17449 +
17450 +       kernel_physical_mapping_init(pgd_base);
17451 +       remap_numa_kva();
17452 +
17453 +       /*
17454 +        * Fixed mappings, only the page table structure has to be
17455 +        * created - mappings will be set by set_fixmap():
17456 +        */
17457 +       vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
17458 +       page_table_range_init(vaddr, 0, pgd_base);
17459 +
17460 +       permanent_kmaps_init(pgd_base);
17461 +}
17462 +
17463 +#ifdef CONFIG_SOFTWARE_SUSPEND
17464 +/*
17465 + * Swap suspend & friends need this for resume because things like the intel-agp
17466 + * driver might have split up a kernel 4MB mapping.
17467 + */
17468 +char __nosavedata swsusp_pg_dir[PAGE_SIZE]
17469 +       __attribute__ ((aligned (PAGE_SIZE)));
17470 +
17471 +static inline void save_pg_dir(void)
17472 +{
17473 +       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
17474 +}
17475 +#else
17476 +static inline void save_pg_dir(void)
17477 +{
17478 +}
17479 +#endif
17480 +
17481 +void zap_low_mappings (void)
17482 +{
17483 +       int i;
17484 +
17485 +       save_pg_dir();
17486 +
17487 +       /*
17488 +        * Zap initial low-memory mappings.
17489 +        *
17490 +        * Note that "pgd_clear()" doesn't do it for
17491 +        * us, because pgd_clear() is a no-op on i386.
17492 +        */
17493 +       for (i = 0; i < USER_PTRS_PER_PGD; i++)
17494 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
17495 +               set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
17496 +#else
17497 +               set_pgd(swapper_pg_dir+i, __pgd(0));
17498 +#endif
17499 +       flush_tlb_all();
17500 +}
17501 +
17502 +static int disable_nx __initdata = 0;
17503 +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
17504 +EXPORT_SYMBOL(__supported_pte_mask);
17505 +
17506 +/*
17507 + * noexec = on|off
17508 + *
17509 + * Control non executable mappings.
17510 + *
17511 + * on      Enable
17512 + * off     Disable
17513 + */
17514 +void __init noexec_setup(const char *str)
17515 +{
17516 +       if (!strncmp(str, "on",2) && cpu_has_nx) {
17517 +               __supported_pte_mask |= _PAGE_NX;
17518 +               disable_nx = 0;
17519 +       } else if (!strncmp(str,"off",3)) {
17520 +               disable_nx = 1;
17521 +               __supported_pte_mask &= ~_PAGE_NX;
17522 +       }
17523 +}
17524 +
17525 +int nx_enabled = 0;
17526 +#ifdef CONFIG_X86_PAE
17527 +
17528 +static void __init set_nx(void)
17529 +{
17530 +       unsigned int v[4], l, h;
17531 +
17532 +       if (cpu_has_pae && (cpuid_eax(0x80000000) >= 0x80000001)) {
17533 +               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
17534 +               if ((v[3] & (1 << 20)) && !disable_nx) {
17535 +                       rdmsr(MSR_EFER, l, h);
17536 +                       l |= EFER_NX;
17537 +                       wrmsr(MSR_EFER, l, h);
17538 +                       nx_enabled = 1;
17539 +                       __supported_pte_mask |= _PAGE_NX;
17540 +               }
17541 +       }
17542 +}
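+
+/*
+ * CPUID leaf 0x80000001 EDX bit 20 advertises NX support.  Setting
+ * the NX-enable bit in MSR_EFER turns bit 63 of each PAE entry into
+ * the no-execute bit, which _PAGE_NX then exposes to the rest of the
+ * pagetable code.
+ */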
17543 +
17544 +/*
17545 + * Enables/disables executability of a given kernel page and
17546 + * returns the previous setting.
17547 + */
17548 +int __init set_kernel_exec(unsigned long vaddr, int enable)
17549 +{
17550 +       pte_t *pte;
17551 +       int ret = 1;
17552 +
17553 +       if (!nx_enabled)
17554 +               goto out;
17555 +
17556 +       pte = lookup_address(vaddr);
17557 +       BUG_ON(!pte);
17558 +
17559 +       if (!pte_exec_kernel(*pte))
17560 +               ret = 0;
17561 +
17562 +       if (enable)
17563 +               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
17564 +       else
17565 +               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
17566 +       __flush_tlb_all();
17567 +out:
17568 +       return ret;
17569 +}
17570 +
17571 +#endif
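A hypothetical usage sketch for set_kernel_exec() (the buffer and call site are illustrative, not part of the patch): toggle one kernel page executable and restore the previous state afterwards.

	/* Hypothetical: 'trampoline' is assumed to be a page-aligned
	 * lowmem kernel buffer. */
	unsigned long va = (unsigned long)trampoline;
	int was_exec;

	was_exec = set_kernel_exec(va, 1);	/* clear NX, remember old state */
	/* ... run code placed in the page ... */
	set_kernel_exec(va, was_exec);		/* restore previous NX setting */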
17572 +
17573 +/*
17574 + * paging_init() sets up the page tables - note that the first 8MB are
17575 + * already mapped by head.S.
17576 + *
17577 + * This routine also unmaps the page at virtual kernel address 0, so
17578 + * that we can trap those pesky NULL-reference errors in the kernel.
17579 + */
17580 +void __init paging_init(void)
17581 +{
17582 +       int i;
17583 +
17584 +#ifdef CONFIG_X86_PAE
17585 +       set_nx();
17586 +       if (nx_enabled)
17587 +               printk("NX (Execute Disable) protection: active\n");
17588 +#endif
17589 +
17590 +       pagetable_init();
17591 +
17592 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
17593 +       /*
17594 +        * We will bail out later - printk doesn't work right now so
17595 +        * the user would just see a hanging kernel.
17596 +        * When running as a Xen domain we are already in PAE mode at
17597 +        * this point.
17598 +        */
17599 +       if (cpu_has_pae)
17600 +               set_in_cr4(X86_CR4_PAE);
17601 +#endif
17602 +       __flush_tlb_all();
17603 +
17604 +       kmap_init();
17605 +
17606 +       if (!xen_feature(XENFEAT_auto_translated_physmap) ||
17607 +           xen_start_info->shared_info >= xen_start_info->nr_pages) {
17608 +               /* Switch to the real shared_info page, and clear the
17609 +                * dummy page. */
17610 +               set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17611 +               HYPERVISOR_shared_info =
17612 +                       (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17613 +               memset(empty_zero_page, 0, sizeof(empty_zero_page));
17614 +       }
17615 +
17616 +       /* Setup mapping of lower 1st MB */
17617 +       for (i = 0; i < NR_FIX_ISAMAPS; i++)
17618 +               if (xen_start_info->flags & SIF_PRIVILEGED)
17619 +                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17620 +               else
17621 +                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
17622 +                                    virt_to_machine(empty_zero_page),
17623 +                                    PAGE_KERNEL_RO);
17624 +}
17625 +
17626 +/*
17627 + * Test if the WP bit works in supervisor mode. It isn't supported on 386's
17628 + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
17629 + * used to involve black magic jumps to work around some nasty CPU bugs,
17630 + * but fortunately the switch to using exceptions got rid of all that.
17631 + */
17632 +
17633 +static void __init test_wp_bit(void)
17634 +{
17635 +       printk("Checking if this processor honours the WP bit even in supervisor mode... ");
17636 +
17637 +       /* Any page-aligned address will do, the test is non-destructive */
17638 +       __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
17639 +       boot_cpu_data.wp_works_ok = do_test_wp_bit();
17640 +       clear_fixmap(FIX_WP_TEST);
17641 +
17642 +       if (!boot_cpu_data.wp_works_ok) {
17643 +               printk("No.\n");
17644 +#ifdef CONFIG_X86_WP_WORKS_OK
17645 +               panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
17646 +#endif
17647 +       } else {
17648 +               printk("Ok.\n");
17649 +       }
17650 +}
17651 +
17652 +static void __init set_max_mapnr_init(void)
17653 +{
17654 +#ifdef CONFIG_HIGHMEM
17655 +       num_physpages = highend_pfn;
17656 +#else
17657 +       num_physpages = max_low_pfn;
17658 +#endif
17659 +#ifdef CONFIG_FLATMEM
17660 +       max_mapnr = num_physpages;
17661 +#endif
17662 +}
17663 +
17664 +static struct kcore_list kcore_mem, kcore_vmalloc; 
17665 +
17666 +void __init mem_init(void)
17667 +{
17668 +       extern int ppro_with_ram_bug(void);
17669 +       int codesize, reservedpages, datasize, initsize;
17670 +       int tmp;
17671 +       int bad_ppro;
17672 +       unsigned long pfn;
17673 +
17674 +       contiguous_bitmap = alloc_bootmem_low_pages(
17675 +               (max_low_pfn + 2*BITS_PER_LONG) >> 3);
17676 +       BUG_ON(!contiguous_bitmap);
17677 +       memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3);
17678 +
17679 +#if defined(CONFIG_SWIOTLB)
17680 +       swiotlb_init(); 
17681 +#endif
17682 +
17683 +#ifdef CONFIG_FLATMEM
17684 +       if (!mem_map)
17685 +               BUG();
17686 +#endif
17687 +       
17688 +       bad_ppro = ppro_with_ram_bug();
17689 +
17690 +#ifdef CONFIG_HIGHMEM
17691 +       /* check that fixmap and pkmap do not overlap */
17692 +       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
17693 +               printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
17694 +               printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
17695 +                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
17696 +               BUG();
17697 +       }
17698 +#endif
17699 +
17700 +       set_max_mapnr_init();
17701 +
17702 +#ifdef CONFIG_HIGHMEM
17703 +       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
17704 +#else
17705 +       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
17706 +#endif
17707 +       printk("vmalloc area: %lx-%lx, maxmem %lx\n",
17708 +              VMALLOC_START,VMALLOC_END,MAXMEM);
17709 +       BUG_ON(VMALLOC_START > VMALLOC_END);
17710 +       
17711 +       /* this will put all low memory onto the freelists */
17712 +       totalram_pages += free_all_bootmem();
17713 +       /* XEN: init and count low-mem pages outside initial allocation. */
17714 +       for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
17715 +               ClearPageReserved(&mem_map[pfn]);
17716 +               set_page_count(&mem_map[pfn], 1);
17717 +               totalram_pages++;
17718 +       }
17719 +
17720 +       reservedpages = 0;
17721 +       for (tmp = 0; tmp < max_low_pfn; tmp++)
17722 +               /*
17723 +                * Only count reserved RAM pages
17724 +                */
17725 +               if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
17726 +                       reservedpages++;
17727 +
17728 +       set_highmem_pages_init(bad_ppro);
17729 +
17730 +       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
17731 +       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
17732 +       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
17733 +
17734 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
17735 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
17736 +                  VMALLOC_END-VMALLOC_START);
17737 +
17738 +       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
17739 +               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
17740 +               num_physpages << (PAGE_SHIFT-10),
17741 +               codesize >> 10,
17742 +               reservedpages << (PAGE_SHIFT-10),
17743 +               datasize >> 10,
17744 +               initsize >> 10,
17745 +               (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
17746 +              );
17747 +
17748 +#ifdef CONFIG_X86_PAE
17749 +       if (!cpu_has_pae)
17750 +               panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
17751 +#endif
17752 +       if (boot_cpu_data.wp_works_ok < 0)
17753 +               test_wp_bit();
17754 +
17755 +       /*
17756 +        * Subtle. SMP is doing its boot stuff late (because it has to
17757 +        * fork idle threads) - but it also needs low mappings for the
17758 +        * protected-mode entry to work. We zap these entries only after
17759 +        * the WP-bit has been tested.
17760 +        */
17761 +#ifndef CONFIG_SMP
17762 +       zap_low_mappings();
17763 +#endif
17764 +
17765 +       set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
17766 +}
17767 +
17768 +/*
17769 + * This is for the non-NUMA, single-node SMP system case.
17770 + * Specifically, in the case of x86, we will always add
17771 + * memory to the highmem for now.
17772 + */
17773 +#ifndef CONFIG_NEED_MULTIPLE_NODES
17774 +int add_memory(u64 start, u64 size)
17775 +{
17776 +       struct pglist_data *pgdata = &contig_page_data;
17777 +       struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
17778 +       unsigned long start_pfn = start >> PAGE_SHIFT;
17779 +       unsigned long nr_pages = size >> PAGE_SHIFT;
17780 +
17781 +       return __add_pages(zone, start_pfn, nr_pages);
17782 +}
17783 +
17784 +int remove_memory(u64 start, u64 size)
17785 +{
17786 +       return -EINVAL;
17787 +}
17788 +#endif
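A minimal hotplug sketch with illustrative values only; as the comment says, the code above always targets the last zone (highmem) of the single node.

	/* Hypothetical: add a 16MB region starting at 256MB. */
	u64 start = 0x10000000ULL;	/* 256MB, page-aligned */
	u64 size  = 0x01000000ULL;	/* 16MB */
	int err = add_memory(start, size);

	if (err)
		printk(KERN_ERR "add_memory failed: %d\n", err);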
17789 +
17790 +kmem_cache_t *pgd_cache;
17791 +kmem_cache_t *pmd_cache;
17792 +
17793 +void __init pgtable_cache_init(void)
17794 +{
17795 +       if (PTRS_PER_PMD > 1) {
17796 +               pmd_cache = kmem_cache_create("pmd",
17797 +                                       PTRS_PER_PMD*sizeof(pmd_t),
17798 +                                       PTRS_PER_PMD*sizeof(pmd_t),
17799 +                                       0,
17800 +                                       pmd_ctor,
17801 +                                       NULL);
17802 +               if (!pmd_cache)
17803 +                       panic("pgtable_cache_init(): cannot create pmd cache");
17804 +       }
17805 +       pgd_cache = kmem_cache_create("pgd",
17806 +#ifndef CONFIG_XEN
17807 +                               PTRS_PER_PGD*sizeof(pgd_t),
17808 +                               PTRS_PER_PGD*sizeof(pgd_t),
17809 +#else
17810 +                               PAGE_SIZE,
17811 +                               PAGE_SIZE,
17812 +#endif
17813 +                               0,
17814 +                               pgd_ctor,
17815 +                               pgd_dtor);
17816 +       if (!pgd_cache)
17817 +               panic("pgtable_cache_init(): Cannot create pgd cache");
17818 +}
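Note that under Xen the pgd slab is sized and aligned to a full PAGE_SIZE instead of PTRS_PER_PGD*sizeof(pgd_t), presumably so each pgd occupies its own page: pgd_ctor()/pgd_dtor() (below, in pgtable-xen.c) hand the whole page to xen_create_contiguous_region() and the pin/unpin paths write-protect it, neither of which would be safe if unrelated objects shared the page. This rationale is inferred from those constructors, not stated in the patch.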
17819 +
17820 +/*
17821 + * This function cannot be __init, since exceptions don't work in that
17822 + * section.  Put this after the callers, so that it cannot be inlined.
17823 + */
17824 +static int noinline do_test_wp_bit(void)
17825 +{
17826 +       char tmp_reg;
17827 +       int flag;
17828 +
17829 +       __asm__ __volatile__(
17830 +               "       movb %0,%1      \n"
17831 +               "1:     movb %1,%0      \n"
17832 +               "       xorl %2,%2      \n"
17833 +               "2:                     \n"
17834 +               ".section __ex_table,\"a\"\n"
17835 +               "       .align 4        \n"
17836 +               "       .long 1b,2b     \n"
17837 +               ".previous              \n"
17838 +               :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
17839 +                "=q" (tmp_reg),
17840 +                "=r" (flag)
17841 +               :"2" (1)
17842 +               :"memory");
17843 +       
17844 +       return flag;
17845 +}
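For readability, a conceptual restatement of the asm above (the real test needs the __ex_table entry so that a fault in the store at label 1 resumes at label 2):

	/*
	 *   flag = 1;           // input constraint "2" (1): assume WP honoured
	 *   tmp  = *ro_page;    // harmless read of the read-only fixmap page
	 *   *ro_page = tmp;     // store back; faults if and only if WP works
	 *   flag = 0;           // reached only when the store did NOT fault
	 *   return flag;        // 1 => WP honoured, 0 => WP broken
	 */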
17846 +
17847 +void free_initmem(void)
17848 +{
17849 +       unsigned long addr;
17850 +
17851 +       addr = (unsigned long)(&__init_begin);
17852 +       for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
17853 +               ClearPageReserved(virt_to_page(addr));
17854 +               set_page_count(virt_to_page(addr), 1);
17855 +               memset((void *)addr, 0xcc, PAGE_SIZE);
17856 +               free_page(addr);
17857 +               totalram_pages++;
17858 +       }
17859 +       printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
17860 +}
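The 0xcc poison byte is the x86 int3 breakpoint opcode, so a stale jump into freed init memory traps immediately rather than executing whatever the allocator hands out next.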
17861 +
17862 +#ifdef CONFIG_DEBUG_RODATA
17863 +
17864 +extern char __start_rodata, __end_rodata;
17865 +void mark_rodata_ro(void)
17866 +{
17867 +       unsigned long addr = (unsigned long)&__start_rodata;
17868 +
17869 +       for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
17870 +               change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
17871 +
17872 +       printk ("Write protecting the kernel read-only data: %luk\n",
17873 +                       (unsigned long)(&__end_rodata - &__start_rodata) >> 10);
17874 +
17875 +       /*
17876 +        * change_page_attr() requires a global_flush_tlb() call after it.
17877 +        * We do this after the printk so that if something went wrong in the
17878 +        * change, the printk gets out at least to give a better debug hint
17879 +        * of who is the culprit.
17880 +        */
17881 +       global_flush_tlb();
17882 +}
17883 +#endif
17884 +
17885 +
17886 +#ifdef CONFIG_BLK_DEV_INITRD
17887 +void free_initrd_mem(unsigned long start, unsigned long end)
17888 +{
17889 +       if (start < end)
17890 +               printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
17891 +       for (; start < end; start += PAGE_SIZE) {
17892 +               ClearPageReserved(virt_to_page(start));
17893 +               set_page_count(virt_to_page(start), 1);
17894 +               free_page(start);
17895 +               totalram_pages++;
17896 +       }
17897 +}
17898 +#endif
17899 diff -Nurp ref-linux-2.6.16.9/arch/i386/mm/ioremap-xen.c tmp-linux-2.6-xen.patch/arch/i386/mm/ioremap-xen.c
17900 --- ref-linux-2.6.16.9/arch/i386/mm/ioremap-xen.c       1970-01-01 01:00:00.000000000 +0100
17901 +++ tmp-linux-2.6-xen.patch/arch/i386/mm/ioremap-xen.c  2006-04-10 00:05:52.000000000 +0200
17902 @@ -0,0 +1,464 @@
17903 +/*
17904 + * arch/i386/mm/ioremap.c
17905 + *
17906 + * Re-map IO memory to kernel address space so that we can access it.
17907 + * This is needed for high PCI addresses that aren't mapped in the
17908 + * 640k-1MB IO memory area on PC's
17909 + *
17910 + * (C) Copyright 1995 1996 Linus Torvalds
17911 + */
17912 +
17913 +#include <linux/vmalloc.h>
17914 +#include <linux/init.h>
17915 +#include <linux/slab.h>
17916 +#include <linux/module.h>
17917 +#include <asm/io.h>
17918 +#include <asm/fixmap.h>
17919 +#include <asm/cacheflush.h>
17920 +#include <asm/tlbflush.h>
17921 +#include <asm/pgtable.h>
17922 +#include <asm/pgalloc.h>
17923 +
17924 +#define ISA_START_ADDRESS      0x0
17925 +#define ISA_END_ADDRESS                0x100000
17926 +
17927 +#if 0 /* not PAE safe */
17928 +/* These hacky macros avoid phys->machine translations. */
17929 +#define __direct_pte(x) ((pte_t) { (x) } )
17930 +#define __direct_mk_pte(page_nr,pgprot) \
17931 +  __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
17932 +#define direct_mk_pte_phys(physpage, pgprot) \
17933 +  __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
17934 +#endif
17935 +
17936 +static int direct_remap_area_pte_fn(pte_t *pte, 
17937 +                                   struct page *pmd_page,
17938 +                                   unsigned long address, 
17939 +                                   void *data)
17940 +{
17941 +       mmu_update_t **v = (mmu_update_t **)data;
17942 +
17943 +       (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
17944 +                    PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
17945 +       (*v)++;
17946 +
17947 +       return 0;
17948 +}
17949 +
17950 +static int __direct_remap_pfn_range(struct mm_struct *mm,
17951 +                                   unsigned long address, 
17952 +                                   unsigned long mfn,
17953 +                                   unsigned long size, 
17954 +                                   pgprot_t prot,
17955 +                                   domid_t  domid)
17956 +{
17957 +       int rc;
17958 +       unsigned long i, start_address;
17959 +       mmu_update_t *u, *v, *w;
17960 +
17961 +       u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
17962 +       if (u == NULL)
17963 +               return -ENOMEM;
17964 +
17965 +       start_address = address;
17966 +
17967 +       flush_cache_all();
17968 +
17969 +       for (i = 0; i < size; i += PAGE_SIZE) {
17970 +               if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
17971 +                       /* Fill in the PTE pointers. */
17972 +                       rc = apply_to_page_range(mm, start_address, 
17973 +                                                address - start_address,
17974 +                                                direct_remap_area_pte_fn, &w);
17975 +                       if (rc)
17976 +                               goto out;
17977 +                       w = u;
17978 +                       rc = -EFAULT;
17979 +                       if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
17980 +                               goto out;
17981 +                       v = u;
17982 +                       start_address = address;
17983 +               }
17984 +
17985 +               /*
17986 +                * Fill in the machine address; the PTE pointer is
17987 +                * filled in afterwards by apply_to_page_range().
17988 +                */
17989 +               v->val = pte_val_ma(pfn_pte_ma(mfn, prot));
17990 +
17991 +               mfn++;
17992 +               address += PAGE_SIZE; 
17993 +               v++;
17994 +       }
17995 +
17996 +       if (v != u) {
17997 +               /* get the ptep's filled in */
17998 +               rc = apply_to_page_range(mm, start_address,
17999 +                                        address - start_address,
18000 +                                        direct_remap_area_pte_fn, &w);
18001 +               if (rc)
18002 +                       goto out;
18003 +               rc = -EFAULT;
18004 +               if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
18005 +                       goto out;
18006 +       }
18007 +
18008 +       rc = 0;
18009 +
18010 + out:
18011 +       flush_tlb_all();
18012 +
18013 +       free_page((unsigned long)u);
18014 +
18015 +       return rc;
18016 +}
18017 +
18018 +int direct_remap_pfn_range(struct vm_area_struct *vma,
18019 +                          unsigned long address, 
18020 +                          unsigned long mfn,
18021 +                          unsigned long size, 
18022 +                          pgprot_t prot,
18023 +                          domid_t  domid)
18024 +{
18025 +       /* Same as remap_pfn_range(). */
18026 +       vma->vm_flags |= VM_IO | VM_RESERVED;
18027 +
18028 +       if (domid == DOMID_SELF)
18029 +               return -EINVAL;
18030 +
18031 +       return __direct_remap_pfn_range(
18032 +               vma->vm_mm, address, mfn, size, prot, domid);
18033 +}
18034 +EXPORT_SYMBOL(direct_remap_pfn_range);
18035 +
18036 +int direct_kernel_remap_pfn_range(unsigned long address, 
18037 +                                 unsigned long mfn,
18038 +                                 unsigned long size, 
18039 +                                 pgprot_t prot,
18040 +                                 domid_t  domid)
18041 +{
18042 +       return __direct_remap_pfn_range(
18043 +               &init_mm, address, mfn, size, prot, domid);
18044 +}
18045 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
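A hypothetical driver-style caller of direct_remap_pfn_range() (sketch only; 'foreign_mfn' and 'foreign_domid' are assumed to come from the device's private data and are not defined by the patch):

	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long size = vma->vm_end - vma->vm_start;

		return direct_remap_pfn_range(vma, vma->vm_start, foreign_mfn,
					      size, vma->vm_page_prot,
					      foreign_domid);
	}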
18046 +
18047 +static int lookup_pte_fn(
18048 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
18049 +{
18050 +       uint64_t *ptep = (uint64_t *)data;
18051 +       if (ptep)
18052 +               *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
18053 +                        PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
18054 +       return 0;
18055 +}
18056 +
18057 +int create_lookup_pte_addr(struct mm_struct *mm, 
18058 +                          unsigned long address,
18059 +                          uint64_t *ptep)
18060 +{
18061 +       return apply_to_page_range(mm, address, PAGE_SIZE,
18062 +                                  lookup_pte_fn, ptep);
18063 +}
18064 +
18065 +EXPORT_SYMBOL(create_lookup_pte_addr);
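A short usage sketch (hypothetical 'mm' and 'vaddr'): fetch the machine address of the PTE slot mapping a virtual address, as a grant-table style consumer would.

	uint64_t pte_maddr;

	if (create_lookup_pte_addr(mm, vaddr, &pte_maddr) == 0) {
		/* pte_maddr now names the pte slot in machine space */
	}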
18066 +
18067 +static int noop_fn(
18068 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
18069 +{
18070 +       return 0;
18071 +}
18072 +
18073 +int touch_pte_range(struct mm_struct *mm,
18074 +                   unsigned long address,
18075 +                   unsigned long size)
18076 +{
18077 +       return apply_to_page_range(mm, address, size, noop_fn, NULL);
18078 +} 
18079 +
18080 +EXPORT_SYMBOL(touch_pte_range);
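Because apply_to_page_range() allocates any missing intermediate page-table levels before invoking its callback, passing the no-op above simply pre-instantiates the tables for a range:

	/* Illustrative: 'vaddr' is assumed; make sure the page tables
	 * backing the range exist before a batched update. */
	touch_pte_range(&init_mm, vaddr, 16 * PAGE_SIZE);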
18081 +
18082 +/*
18083 + * Does @address reside within a non-highmem page that is local to this virtual
18084 + * machine (i.e., not an I/O page, nor a memory page belonging to another VM)?
18085 + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
18086 + * why this works.
18087 + */
18088 +static inline int is_local_lowmem(unsigned long address)
18089 +{
18090 +       extern unsigned long max_low_pfn;
18091 +       return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
18092 +}
18093 +
18094 +/*
18095 + * Generic mapping function (not visible outside):
18096 + */
18097 +
18098 +/*
18099 + * Remap an arbitrary physical address space into the kernel virtual
18100 + * address space. Needed when the kernel wants to access high addresses
18101 + * directly.
18102 + *
18103 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
18104 + * have to convert them into an offset in a page-aligned mapping, but the
18105 + * caller shouldn't need to know that small detail.
18106 + */
18107 +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
18108 +{
18109 +       void __iomem * addr;
18110 +       struct vm_struct * area;
18111 +       unsigned long offset, last_addr;
18112 +       domid_t domid = DOMID_IO;
18113 +
18114 +       /* Don't allow wraparound or zero size */
18115 +       last_addr = phys_addr + size - 1;
18116 +       if (!size || last_addr < phys_addr)
18117 +               return NULL;
18118 +
18119 +       /*
18120 +        * Don't remap the low PCI/ISA area, it's always mapped..
18121 +        */
18122 +       if (xen_start_info->flags & SIF_PRIVILEGED &&
18123 +           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
18124 +               return (void __iomem *) isa_bus_to_virt(phys_addr);
18125 +
18126 +       /*
18127 +        * Don't allow anybody to remap normal RAM that we're using..
18128 +        */
18129 +       if (is_local_lowmem(phys_addr)) {
18130 +               char *t_addr, *t_end;
18131 +               struct page *page;
18132 +
18133 +               t_addr = bus_to_virt(phys_addr);
18134 +               t_end = t_addr + (size - 1);
18135 +          
18136 +               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
18137 +                       if(!PageReserved(page))
18138 +                               return NULL;
18139 +
18140 +               domid = DOMID_SELF;
18141 +       }
18142 +
18143 +       /*
18144 +        * Mappings have to be page-aligned
18145 +        */
18146 +       offset = phys_addr & ~PAGE_MASK;
18147 +       phys_addr &= PAGE_MASK;
18148 +       size = PAGE_ALIGN(last_addr+1) - phys_addr;
18149 +
18150 +       /*
18151 +        * Ok, go for it..
18152 +        */
18153 +       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
18154 +       if (!area)
18155 +               return NULL;
18156 +       area->phys_addr = phys_addr;
18157 +       addr = (void __iomem *) area->addr;
18158 +       flags |= _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
18159 +#ifdef __x86_64__
18160 +       flags |= _PAGE_USER;
18161 +#endif
18162 +       if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
18163 +                                    phys_addr>>PAGE_SHIFT,
18164 +                                    size, __pgprot(flags), domid)) {
18165 +               vunmap((void __force *) addr);
18166 +               return NULL;
18167 +       }
18168 +       return (void __iomem *) (offset + (char __iomem *)addr);
18169 +}
18170 +EXPORT_SYMBOL(__ioremap);
18171 +
18172 +/**
18173 + * ioremap_nocache     -   map bus memory into CPU space
18174 + * @offset:    bus address of the memory
18175 + * @size:      size of the resource to map
18176 + *
18177 + * ioremap_nocache performs a platform specific sequence of operations to
18178 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
18179 + * writew/writel functions and the other mmio helpers. The returned
18180 + * address is not guaranteed to be usable directly as a virtual
18181 + * address. 
18182 + *
18183 + * This version of ioremap ensures that the memory is marked uncachable
18184 + * on the CPU as well as honouring existing caching rules from things like
18185 + * the PCI bus. Note that there are other caches and buffers on many 
18186 + * busses. In particular, driver authors should read up on PCI writes.
18187 + *
18188 + * It's useful if some control registers are in such an area and
18189 + * write combining or read caching is not desirable:
18190 + * 
18191 + * Must be freed with iounmap.
18192 + */
18193 +
18194 +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
18195 +{
18196 +       unsigned long last_addr;
18197 +       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
18198 +       if (!p) 
18199 +               return p; 
18200 +
18201 +       /* Guaranteed to be > phys_addr, as per __ioremap() */
18202 +       last_addr = phys_addr + size - 1;
18203 +
18204 +       if (is_local_lowmem(last_addr)) { 
18205 +               struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
18206 +               unsigned long npages;
18207 +
18208 +               phys_addr &= PAGE_MASK;
18209 +
18210 +               /* This might overflow and become zero.. */
18211 +               last_addr = PAGE_ALIGN(last_addr);
18212 +
18213 +               /* .. but that's ok, because modulo-2**n arithmetic will make
18214 +                * the page-aligned "last - first" come out right.
18215 +                */
18216 +               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
18217 +
18218 +               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
18219 +                       iounmap(p); 
18220 +                       p = NULL;
18221 +               }
18222 +               global_flush_tlb();
18223 +       }
18224 +
18225 +       return p;                                       
18226 +}
18227 +EXPORT_SYMBOL(ioremap_nocache);
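The usual calling pattern, as a sketch (physical address and register offsets are illustrative only):

	void __iomem *regs = ioremap_nocache(0xfebf0000UL, 0x1000);

	if (regs) {
		writel(1, regs + 0x04);		/* hypothetical control register */
		(void)readl(regs + 0x00);	/* read back to flush the posted write */
		iounmap(regs);
	}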
18228 +
18229 +/**
18230 + * iounmap - Free an IO remapping
18231 + * @addr: virtual address from ioremap_*
18232 + *
18233 + * Caller must ensure there is only one unmapping for the same pointer.
18234 + */
18235 +void iounmap(volatile void __iomem *addr)
18236 +{
18237 +       struct vm_struct *p, *o;
18238 +
18239 +       if ((void __force *)addr <= high_memory)
18240 +               return;
18241 +
18242 +       /*
18243 +        * __ioremap special-cases the PCI/ISA range by not instantiating a
18244 +        * vm_area and by simply returning an address into the kernel mapping
18245 +        * of ISA space.   So handle that here.
18246 +        */
18247 +       if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
18248 +               return;
18249 +
18250 +       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
18251 +
18252 +       /* Use the vm area unlocked, assuming the caller
18253 +          ensures there isn't another iounmap for the same address
18254 +          in parallel. Reuse of the virtual address is prevented by
18255 +          leaving it in the global lists until we're done with it.
18256 +          cpa takes care of the direct mappings. */
18257 +       read_lock(&vmlist_lock);
18258 +       for (p = vmlist; p; p = p->next) {
18259 +               if (p->addr == addr)
18260 +                       break;
18261 +       }
18262 +       read_unlock(&vmlist_lock);
18263 +
18264 +       if (!p) {
18265 +               printk("iounmap: bad address %p\n", addr);
18266 +               dump_stack();
18267 +               return;
18268 +       }
18269 +
18270 +       /* Reset the direct mapping. Can block */
18271 +       if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
18272 +               /* p->size includes the guard page, but cpa doesn't like that */
18273 +               change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
18274 +                                (p->size - PAGE_SIZE) >> PAGE_SHIFT,
18275 +                                PAGE_KERNEL);
18276 +               global_flush_tlb();
18277 +       } 
18278 +
18279 +       /* Finally remove it */
18280 +       o = remove_vm_area((void *)addr);
18281 +       BUG_ON(p != o || o == NULL);
18282 +       kfree(p); 
18283 +}
18284 +EXPORT_SYMBOL(iounmap);
18285 +
18286 +#ifdef __i386__
18287 +
18288 +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
18289 +{
18290 +       unsigned long offset, last_addr;
18291 +       unsigned int nrpages;
18292 +       enum fixed_addresses idx;
18293 +
18294 +       /* Don't allow wraparound or zero size */
18295 +       last_addr = phys_addr + size - 1;
18296 +       if (!size || last_addr < phys_addr)
18297 +               return NULL;
18298 +
18299 +       /*
18300 +        * Don't remap the low PCI/ISA area, it's always mapped..
18301 +        */
18302 +       if (xen_start_info->flags & SIF_PRIVILEGED &&
18303 +           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
18304 +               return isa_bus_to_virt(phys_addr);
18305 +
18306 +       /*
18307 +        * Mappings have to be page-aligned
18308 +        */
18309 +       offset = phys_addr & ~PAGE_MASK;
18310 +       phys_addr &= PAGE_MASK;
18311 +       size = PAGE_ALIGN(last_addr) - phys_addr;
18312 +
18313 +       /*
18314 +        * Mappings have to fit in the FIX_BTMAP area.
18315 +        */
18316 +       nrpages = size >> PAGE_SHIFT;
18317 +       if (nrpages > NR_FIX_BTMAPS)
18318 +               return NULL;
18319 +
18320 +       /*
18321 +        * Ok, go for it..
18322 +        */
18323 +       idx = FIX_BTMAP_BEGIN;
18324 +       while (nrpages > 0) {
18325 +               set_fixmap(idx, phys_addr);
18326 +               phys_addr += PAGE_SIZE;
18327 +               --idx;
18328 +               --nrpages;
18329 +       }
18330 +       return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
18331 +}
18332 +
18333 +void __init bt_iounmap(void *addr, unsigned long size)
18334 +{
18335 +       unsigned long virt_addr;
18336 +       unsigned long offset;
18337 +       unsigned int nrpages;
18338 +       enum fixed_addresses idx;
18339 +
18340 +       virt_addr = (unsigned long)addr;
18341 +       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
18342 +               return;
18343 +       if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
18344 +               return;
18345 +       offset = virt_addr & ~PAGE_MASK;
18346 +       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
18347 +
18348 +       idx = FIX_BTMAP_BEGIN;
18349 +       while (nrpages > 0) {
18350 +               clear_fixmap(idx);
18351 +               --idx;
18352 +               --nrpages;
18353 +       }
18354 +}
18355 +
18356 +#endif /* __i386__ */
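A hypothetical boot-time sketch (from __init code, illustrative address): bt_ioremap()/bt_iounmap() cover the window before the vmalloc-based __ioremap() above is usable, using the FIX_BTMAP fixmap slots.

	void *tbl = bt_ioremap(0x7fee0000UL, 2 * PAGE_SIZE);

	if (tbl) {
		/* ... inspect a firmware table ... */
		bt_iounmap(tbl, 2 * PAGE_SIZE);
	}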
18357 +
18358 +/*
18359 + * Local variables:
18360 + *  c-file-style: "linux"
18361 + *  indent-tabs-mode: t
18362 + *  c-indent-level: 8
18363 + *  c-basic-offset: 8
18364 + *  tab-width: 8
18365 + * End:
18366 + */
18367 diff -Nurp ref-linux-2.6.16.9/arch/i386/mm/Makefile tmp-linux-2.6-xen.patch/arch/i386/mm/Makefile
18368 --- ref-linux-2.6.16.9/arch/i386/mm/Makefile    2006-04-19 08:10:14.000000000 +0200
18369 +++ tmp-linux-2.6-xen.patch/arch/i386/mm/Makefile       2006-04-10 00:05:52.000000000 +0200
18370 @@ -8,3 +8,11 @@ obj-$(CONFIG_NUMA) += discontig.o
18371  obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
18372  obj-$(CONFIG_HIGHMEM) += highmem.o
18373  obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
18374 +
18375 +ifdef CONFIG_XEN
18376 +include $(srctree)/scripts/Makefile.xen
18377 +
18378 +obj-y          += hypervisor.o
18379 +
18380 +obj-y := $(call cherrypickxen, $(obj-y))
18381 +endif
18382 diff -Nurp ref-linux-2.6.16.9/arch/i386/mm/pgtable.c tmp-linux-2.6-xen.patch/arch/i386/mm/pgtable.c
18383 --- ref-linux-2.6.16.9/arch/i386/mm/pgtable.c   2006-04-19 08:10:14.000000000 +0200
18384 +++ tmp-linux-2.6-xen.patch/arch/i386/mm/pgtable.c      2006-04-10 00:05:52.000000000 +0200
18385 @@ -13,6 +13,7 @@
18386  #include <linux/slab.h>
18387  #include <linux/pagemap.h>
18388  #include <linux/spinlock.h>
18389 +#include <linux/module.h>
18390  
18391  #include <asm/system.h>
18392  #include <asm/pgtable.h>
18393 @@ -138,6 +139,10 @@ void set_pmd_pfn(unsigned long vaddr, un
18394         __flush_tlb_one(vaddr);
18395  }
18396  
18397 +static int nr_fixmaps = 0;
18398 +unsigned long __FIXADDR_TOP = 0xfffff000;
18399 +EXPORT_SYMBOL(__FIXADDR_TOP);
18400 +
18401  void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
18402  {
18403         unsigned long address = __fix_to_virt(idx);
18404 @@ -147,6 +152,13 @@ void __set_fixmap (enum fixed_addresses 
18405                 return;
18406         }
18407         set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
18408 +       nr_fixmaps++;
18409 +}
18410 +
18411 +void set_fixaddr_top(unsigned long top)
18412 +{
18413 +       BUG_ON(nr_fixmaps > 0);
18414 +       __FIXADDR_TOP = top - PAGE_SIZE;
18415  }
18416  
18417  pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
18418 diff -Nurp ref-linux-2.6.16.9/arch/i386/mm/pgtable-xen.c tmp-linux-2.6-xen.patch/arch/i386/mm/pgtable-xen.c
18419 --- ref-linux-2.6.16.9/arch/i386/mm/pgtable-xen.c       1970-01-01 01:00:00.000000000 +0100
18420 +++ tmp-linux-2.6-xen.patch/arch/i386/mm/pgtable-xen.c  2006-04-10 00:05:52.000000000 +0200
18421 @@ -0,0 +1,652 @@
18422 +/*
18423 + *  linux/arch/i386/mm/pgtable.c
18424 + */
18425 +
18426 +#include <linux/config.h>
18427 +#include <linux/sched.h>
18428 +#include <linux/kernel.h>
18429 +#include <linux/errno.h>
18430 +#include <linux/mm.h>
18431 +#include <linux/swap.h>
18432 +#include <linux/smp.h>
18433 +#include <linux/highmem.h>
18434 +#include <linux/slab.h>
18435 +#include <linux/pagemap.h>
18436 +#include <linux/spinlock.h>
18437 +#include <linux/module.h>
18438 +
18439 +#include <asm/system.h>
18440 +#include <asm/pgtable.h>
18441 +#include <asm/pgalloc.h>
18442 +#include <asm/fixmap.h>
18443 +#include <asm/e820.h>
18444 +#include <asm/tlb.h>
18445 +#include <asm/tlbflush.h>
18446 +#include <asm/io.h>
18447 +#include <asm/mmu_context.h>
18448 +
18449 +#include <xen/features.h>
18450 +#include <xen/foreign_page.h>
18451 +#include <asm/hypervisor.h>
18452 +
18453 +static void pgd_test_and_unpin(pgd_t *pgd);
18454 +
18455 +void show_mem(void)
18456 +{
18457 +       int total = 0, reserved = 0;
18458 +       int shared = 0, cached = 0;
18459 +       int highmem = 0;
18460 +       struct page *page;
18461 +       pg_data_t *pgdat;
18462 +       unsigned long i;
18463 +       struct page_state ps;
18464 +       unsigned long flags;
18465 +
18466 +       printk(KERN_INFO "Mem-info:\n");
18467 +       show_free_areas();
18468 +       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
18469 +       for_each_pgdat(pgdat) {
18470 +               pgdat_resize_lock(pgdat, &flags);
18471 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
18472 +                       page = pgdat_page_nr(pgdat, i);
18473 +                       total++;
18474 +                       if (PageHighMem(page))
18475 +                               highmem++;
18476 +                       if (PageReserved(page))
18477 +                               reserved++;
18478 +                       else if (PageSwapCache(page))
18479 +                               cached++;
18480 +                       else if (page_count(page))
18481 +                               shared += page_count(page) - 1;
18482 +               }
18483 +               pgdat_resize_unlock(pgdat, &flags);
18484 +       }
18485 +       printk(KERN_INFO "%d pages of RAM\n", total);
18486 +       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
18487 +       printk(KERN_INFO "%d reserved pages\n", reserved);
18488 +       printk(KERN_INFO "%d pages shared\n", shared);
18489 +       printk(KERN_INFO "%d pages swap cached\n", cached);
18490 +
18491 +       get_page_state(&ps);
18492 +       printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
18493 +       printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
18494 +       printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
18495 +       printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
18496 +       printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
18497 +}
18498 +
18499 +/*
18500 + * Associate a virtual page frame with a given physical page frame 
18501 + * and protection flags for that frame.
18502 + */ 
18503 +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
18504 +{
18505 +       pgd_t *pgd;
18506 +       pud_t *pud;
18507 +       pmd_t *pmd;
18508 +       pte_t *pte;
18509 +
18510 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18511 +       if (pgd_none(*pgd)) {
18512 +               BUG();
18513 +               return;
18514 +       }
18515 +       pud = pud_offset(pgd, vaddr);
18516 +       if (pud_none(*pud)) {
18517 +               BUG();
18518 +               return;
18519 +       }
18520 +       pmd = pmd_offset(pud, vaddr);
18521 +       if (pmd_none(*pmd)) {
18522 +               BUG();
18523 +               return;
18524 +       }
18525 +       pte = pte_offset_kernel(pmd, vaddr);
18526 +       /* <pfn,flags> stored as-is, to permit clearing entries */
18527 +       set_pte(pte, pfn_pte(pfn, flags));
18528 +
18529 +       /*
18530 +        * It's enough to flush this one mapping.
18531 +        * (PGE mappings get flushed as well)
18532 +        */
18533 +       __flush_tlb_one(vaddr);
18534 +}
18535 +
18536 +/*
18537 + * Associate a virtual page frame with a given physical page frame 
18538 + * and protection flags for that frame.
18539 + */ 
18540 +static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
18541 +                          pgprot_t flags)
18542 +{
18543 +       pgd_t *pgd;
18544 +       pud_t *pud;
18545 +       pmd_t *pmd;
18546 +       pte_t *pte;
18547 +
18548 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18549 +       if (pgd_none(*pgd)) {
18550 +               BUG();
18551 +               return;
18552 +       }
18553 +       pud = pud_offset(pgd, vaddr);
18554 +       if (pud_none(*pud)) {
18555 +               BUG();
18556 +               return;
18557 +       }
18558 +       pmd = pmd_offset(pud, vaddr);
18559 +       if (pmd_none(*pmd)) {
18560 +               BUG();
18561 +               return;
18562 +       }
18563 +       pte = pte_offset_kernel(pmd, vaddr);
18564 +       /* <pfn,flags> stored as-is, to permit clearing entries */
18565 +       set_pte(pte, pfn_pte_ma(pfn, flags));
18566 +
18567 +       /*
18568 +        * It's enough to flush this one mapping.
18569 +        * (PGE mappings get flushed as well)
18570 +        */
18571 +       __flush_tlb_one(vaddr);
18572 +}
18573 +
18574 +/*
18575 + * Associate a large virtual page frame with a given physical page frame 
18576 + * and protection flags for that frame. pfn is for the base of the page,
18577 + * vaddr is what the page gets mapped to - both must be properly aligned. 
18578 + * The pmd must already be instantiated. Assumes PAE mode.
18579 + */ 
18580 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
18581 +{
18582 +       pgd_t *pgd;
18583 +       pud_t *pud;
18584 +       pmd_t *pmd;
18585 +
18586 +       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
18587 +               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
18588 +               return; /* BUG(); */
18589 +       }
18590 +       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
18591 +               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
18592 +               return; /* BUG(); */
18593 +       }
18594 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18595 +       if (pgd_none(*pgd)) {
18596 +               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
18597 +               return; /* BUG(); */
18598 +       }
18599 +       pud = pud_offset(pgd, vaddr);
18600 +       pmd = pmd_offset(pud, vaddr);
18601 +       set_pmd(pmd, pfn_pmd(pfn, flags));
18602 +       /*
18603 +        * It's enough to flush this one mapping.
18604 +        * (PGE mappings get flushed as well)
18605 +        */
18606 +       __flush_tlb_one(vaddr);
18607 +}
18608 +
18609 +static int nr_fixmaps = 0;
18610 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
18611 +EXPORT_SYMBOL(__FIXADDR_TOP);
18612 +
18613 +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
18614 +{
18615 +       unsigned long address = __fix_to_virt(idx);
18616 +
18617 +       if (idx >= __end_of_fixed_addresses) {
18618 +               BUG();
18619 +               return;
18620 +       }
18621 +       switch (idx) {
18622 +       case FIX_WP_TEST:
18623 +#ifdef CONFIG_X86_F00F_BUG
18624 +       case FIX_F00F_IDT:
18625 +#endif
18626 +               set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
18627 +               break;
18628 +       default:
18629 +               set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
18630 +               break;
18631 +       }
18632 +       nr_fixmaps++;
18633 +}
18634 +
18635 +void set_fixaddr_top(unsigned long top)
18636 +{
18637 +       BUG_ON(nr_fixmaps > 0);
18638 +       __FIXADDR_TOP = top - PAGE_SIZE;
18639 +}
18640 +
18641 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
18642 +{
18643 +       pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
18644 +       if (pte)
18645 +               make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
18646 +       return pte;
18647 +}
18648 +
18649 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18650 +{
18651 +       struct page *pte;
18652 +
18653 +#ifdef CONFIG_HIGHPTE
18654 +       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18655 +#else
18656 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18657 +       if (pte) {
18658 +               SetPageForeign(pte, pte_free);
18659 +               set_page_count(pte, 1);
18660 +       }
18661 +#endif
18662 +       return pte;
18663 +}
18664 +
18665 +void pte_free(struct page *pte)
18666 +{
18667 +       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
18668 +
18669 +       if (!pte_write(*virt_to_ptep(va)))
18670 +               BUG_ON(HYPERVISOR_update_va_mapping(
18671 +                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
18672 +
18673 +       ClearPageForeign(pte);
18674 +       set_page_count(pte, 1);
18675 +
18676 +       __free_page(pte);
18677 +}
18678 +
18679 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
18680 +{
18681 +       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18682 +}
18683 +
18684 +/*
18685 + * List of all pgd's needed for non-PAE so it can invalidate entries
18686 + * in both cached and uncached pgd's; not needed for PAE since the
18687 + * kernel pmd is shared. If PAE were not to share the pmd a similar
18688 + * tactic would be needed. This is essentially codepath-based locking
18689 + * against pageattr.c; it is the unique case in which a valid change
18690 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
18691 + * vmalloc faults work because attached pagetables are never freed.
18692 + * The locking scheme was chosen on the basis of manfred's
18693 + * recommendations and having no core impact whatsoever.
18694 + * -- wli
18695 + */
18696 +DEFINE_SPINLOCK(pgd_lock);
18697 +struct page *pgd_list;
18698 +
18699 +static inline void pgd_list_add(pgd_t *pgd)
18700 +{
18701 +       struct page *page = virt_to_page(pgd);
18702 +       page->index = (unsigned long)pgd_list;
18703 +       if (pgd_list)
18704 +               set_page_private(pgd_list, (unsigned long)&page->index);
18705 +       pgd_list = page;
18706 +       set_page_private(page, (unsigned long)&pgd_list);
18707 +}
18708 +
18709 +static inline void pgd_list_del(pgd_t *pgd)
18710 +{
18711 +       struct page *next, **pprev, *page = virt_to_page(pgd);
18712 +       next = (struct page *)page->index;
18713 +       pprev = (struct page **)page_private(page);
18714 +       *pprev = next;
18715 +       if (next)
18716 +               set_page_private(next, (unsigned long)pprev);
18717 +}
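The two helpers above are the classic hlist-style intrusive list: page->index is the forward link and page_private() stores a pointer to the previous element's link field, so deletion needs no special case for the head. The same pattern with standalone (hypothetical) names:

	struct node { struct node *next, **pprev; };

	static void node_del(struct node *n)
	{
		*n->pprev = n->next;		/* also works when n is the head */
		if (n->next)
			n->next->pprev = n->pprev;
	}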
18718 +
18719 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
18720 +{
18721 +       unsigned long flags;
18722 +
18723 +       if (PTRS_PER_PMD > 1) {
18724 +               if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18725 +                       int rc = xen_create_contiguous_region(
18726 +                               (unsigned long)pgd, 0, 32);
18727 +                       BUG_ON(rc);
18728 +               }
18729 +               if (HAVE_SHARED_KERNEL_PMD)
18730 +                       memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
18731 +                              swapper_pg_dir + USER_PTRS_PER_PGD,
18732 +                              (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
18733 +       } else {
18734 +               spin_lock_irqsave(&pgd_lock, flags);
18735 +               memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
18736 +                      swapper_pg_dir + USER_PTRS_PER_PGD,
18737 +                      (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
18738 +               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18739 +               pgd_list_add(pgd);
18740 +               spin_unlock_irqrestore(&pgd_lock, flags);
18741 +       }
18742 +}
18743 +
18744 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
18745 +{
18746 +       unsigned long flags; /* can be called from interrupt context */
18747 +
18748 +       if (PTRS_PER_PMD > 1) {
18749 +               if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18750 +                       xen_destroy_contiguous_region((unsigned long)pgd, 0);
18751 +       } else {
18752 +               spin_lock_irqsave(&pgd_lock, flags);
18753 +               pgd_list_del(pgd);
18754 +               spin_unlock_irqrestore(&pgd_lock, flags);
18755 +
18756 +               pgd_test_and_unpin(pgd);
18757 +       }
18758 +}
18759 +
18760 +pgd_t *pgd_alloc(struct mm_struct *mm)
18761 +{
18762 +       int i;
18763 +       pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
18764 +
18765 +       pgd_test_and_unpin(pgd);
18766 +
18767 +       if (PTRS_PER_PMD == 1 || !pgd)
18768 +               return pgd;
18769 +
18770 +       for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
18771 +               pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18772 +               if (!pmd)
18773 +                       goto out_oom;
18774 +               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18775 +       }
18776 +
18777 +       if (!HAVE_SHARED_KERNEL_PMD) {
18778 +               unsigned long flags;
18779 +
18780 +               for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18781 +                       pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18782 +                       if (!pmd)
18783 +                               goto out_oom;
18784 +                       set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
18785 +               }
18786 +
18787 +               spin_lock_irqsave(&pgd_lock, flags);
18788 +               for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18789 +                       unsigned long v = (unsigned long)i << PGDIR_SHIFT;
18790 +                       pgd_t *kpgd = pgd_offset_k(v);
18791 +                       pud_t *kpud = pud_offset(kpgd, v);
18792 +                       pmd_t *kpmd = pmd_offset(kpud, v);
18793 +                       pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
18794 +                       memcpy(pmd, kpmd, PAGE_SIZE);
18795 +                       make_lowmem_page_readonly(
18796 +                               pmd, XENFEAT_writable_page_tables);
18797 +               }
18798 +               pgd_list_add(pgd);
18799 +               spin_unlock_irqrestore(&pgd_lock, flags);
18800 +       }
18801 +
18802 +       return pgd;
18803 +
18804 +out_oom:
18805 +       for (i--; i >= 0; i--)
18806 +               kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
18807 +       kmem_cache_free(pgd_cache, pgd);
18808 +       return NULL;
18809 +}
18810 +
18811 +void pgd_free(pgd_t *pgd)
18812 +{
18813 +       int i;
18814 +
18815 +       pgd_test_and_unpin(pgd);
18816 +
18817 +       /* in the PAE case user pgd entries are overwritten before usage */
18818 +       if (PTRS_PER_PMD > 1) {
18819 +               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
18820 +                       pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
18821 +                       kmem_cache_free(pmd_cache, pmd);
18822 +               }
18823 +               if (!HAVE_SHARED_KERNEL_PMD) {
18824 +                       unsigned long flags;
18825 +                       spin_lock_irqsave(&pgd_lock, flags);
18826 +                       pgd_list_del(pgd);
18827 +                       spin_unlock_irqrestore(&pgd_lock, flags);
18828 +                       for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18829 +                               pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
18830 +                               make_lowmem_page_writable(
18831 +                                       pmd, XENFEAT_writable_page_tables);
18832 +                               memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18833 +                               kmem_cache_free(pmd_cache, pmd);
18834 +                       }
18835 +               }
18836 +       }
18837 +       /* in the non-PAE case, free_pgtables() clears user pgd entries */
18838 +       kmem_cache_free(pgd_cache, pgd);
18839 +}
18840 +
18841 +void make_lowmem_page_readonly(void *va, unsigned int feature)
18842 +{
18843 +       pte_t *pte;
18844 +       int rc;
18845 +
18846 +       if (xen_feature(feature))
18847 +               return;
18848 +
18849 +       pte = virt_to_ptep(va);
18850 +       rc = HYPERVISOR_update_va_mapping(
18851 +               (unsigned long)va, pte_wrprotect(*pte), 0);
18852 +       BUG_ON(rc);
18853 +}
18854 +
18855 +void make_lowmem_page_writable(void *va, unsigned int feature)
18856 +{
18857 +       pte_t *pte;
18858 +       int rc;
18859 +
18860 +       if (xen_feature(feature))
18861 +               return;
18862 +
18863 +       pte = virt_to_ptep(va);
18864 +       rc = HYPERVISOR_update_va_mapping(
18865 +               (unsigned long)va, pte_mkwrite(*pte), 0);
18866 +       BUG_ON(rc);
18867 +}
18868 +
18869 +void make_page_readonly(void *va, unsigned int feature)
18870 +{
18871 +       pte_t *pte;
18872 +       int rc;
18873 +
18874 +       if (xen_feature(feature))
18875 +               return;
18876 +
18877 +       pte = virt_to_ptep(va);
18878 +       rc = HYPERVISOR_update_va_mapping(
18879 +               (unsigned long)va, pte_wrprotect(*pte), 0);
18880 +       if (rc) /* fallback? */
18881 +               xen_l1_entry_update(pte, pte_wrprotect(*pte));
18882 +       if ((unsigned long)va >= (unsigned long)high_memory) {
18883 +               unsigned long pfn = pte_pfn(*pte);
18884 +#ifdef CONFIG_HIGHMEM
18885 +               if (pfn >= highstart_pfn)
18886 +                       kmap_flush_unused(); /* flush stale writable kmaps */
18887 +               else
18888 +#endif
18889 +                       make_lowmem_page_readonly(
18890 +                               phys_to_virt(pfn << PAGE_SHIFT), feature); 
18891 +       }
18892 +}
18893 +
18894 +void make_page_writable(void *va, unsigned int feature)
18895 +{
18896 +       pte_t *pte;
18897 +       int rc;
18898 +
18899 +       if (xen_feature(feature))
18900 +               return;
18901 +
18902 +       pte = virt_to_ptep(va);
18903 +       rc = HYPERVISOR_update_va_mapping(
18904 +               (unsigned long)va, pte_mkwrite(*pte), 0);
18905 +       if (rc) /* fallback? */
18906 +               xen_l1_entry_update(pte, pte_mkwrite(*pte));
18907 +       if ((unsigned long)va >= (unsigned long)high_memory) {
18908 +               unsigned long pfn = pte_pfn(*pte); 
18909 +#ifdef CONFIG_HIGHMEM
18910 +               if (pfn < highstart_pfn)
18911 +#endif
18912 +                       make_lowmem_page_writable(
18913 +                               phys_to_virt(pfn << PAGE_SHIFT), feature);
18914 +       }
18915 +}
18916 +
18917 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18918 +{
18919 +       if (xen_feature(feature))
18920 +               return;
18921 +
18922 +       while (nr-- != 0) {
18923 +               make_page_readonly(va, feature);
18924 +               va = (void *)((unsigned long)va + PAGE_SIZE);
18925 +       }
18926 +}
18927 +
18928 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18929 +{
18930 +       if (xen_feature(feature))
18931 +               return;
18932 +
18933 +       while (nr-- != 0) {
18934 +               make_page_writable(va, feature);
18935 +               va = (void *)((unsigned long)va + PAGE_SIZE);
18936 +       }
18937 +}
18938 +
18939 +static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
18940 +{
18941 +       struct page *page = virt_to_page(pt);
18942 +       unsigned long pfn = page_to_pfn(page);
18943 +
18944 +       if (PageHighMem(page))
18945 +               return;
18946 +       BUG_ON(HYPERVISOR_update_va_mapping(
18947 +               (unsigned long)__va(pfn << PAGE_SHIFT),
18948 +               pfn_pte(pfn, flags), 0));
18949 +}
18950 +
18951 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
18952 +{
18953 +       pgd_t *pgd = pgd_base;
18954 +       pud_t *pud;
18955 +       pmd_t *pmd;
18956 +       pte_t *pte;
18957 +       int    g, u, m;
18958 +
18959 +       if (xen_feature(XENFEAT_auto_translated_physmap))
18960 +               return;
18961 +
18962 +       for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18963 +               if (pgd_none(*pgd))
18964 +                       continue;
18965 +               pud = pud_offset(pgd, 0);
18966 +               if (PTRS_PER_PUD > 1) /* not folded */
18967 +                       pgd_walk_set_prot(pud,flags);
18968 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18969 +                       if (pud_none(*pud))
18970 +                               continue;
18971 +                       pmd = pmd_offset(pud, 0);
18972 +                       if (PTRS_PER_PMD > 1) /* not folded */
18973 +                               pgd_walk_set_prot(pmd,flags);
18974 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18975 +                               if (pmd_none(*pmd))
18976 +                                       continue;
18977 +                               pte = pte_offset_kernel(pmd,0);
18978 +                               pgd_walk_set_prot(pte,flags);
18979 +                       }
18980 +               }
18981 +       }
18982 +
18983 +       BUG_ON(HYPERVISOR_update_va_mapping(
18984 +               (unsigned long)pgd_base,
18985 +               pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18986 +               UVMF_TLB_FLUSH));
18987 +}
18988 +
18989 +static void __pgd_pin(pgd_t *pgd)
18990 +{
18991 +       pgd_walk(pgd, PAGE_KERNEL_RO);
18992 +       xen_pgd_pin(__pa(pgd));
18993 +       set_bit(PG_pinned, &virt_to_page(pgd)->flags);
18994 +}
18995 +
18996 +static void __pgd_unpin(pgd_t *pgd)
18997 +{
18998 +       xen_pgd_unpin(__pa(pgd));
18999 +       pgd_walk(pgd, PAGE_KERNEL);
19000 +       clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
19001 +}
19002 +
19003 +static void pgd_test_and_unpin(pgd_t *pgd)
19004 +{
19005 +       if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
19006 +               __pgd_unpin(pgd);
19007 +}
19008 +
19009 +void mm_pin(struct mm_struct *mm)
19010 +{
19011 +       if (xen_feature(XENFEAT_writable_page_tables))
19012 +           return;
19013 +       spin_lock(&mm->page_table_lock);
19014 +       __pgd_pin(mm->pgd);
19015 +       spin_unlock(&mm->page_table_lock);
19016 +}
19017 +
19018 +void mm_unpin(struct mm_struct *mm)
19019 +{
19020 +       if (xen_feature(XENFEAT_writable_page_tables))
19021 +           return;
19022 +       spin_lock(&mm->page_table_lock);
19023 +       __pgd_unpin(mm->pgd);
19024 +       spin_unlock(&mm->page_table_lock);
19025 +}
19026 +
19027 +void mm_pin_all(void)
19028 +{
19029 +       struct page *page;
19030 +       if (xen_feature(XENFEAT_writable_page_tables))
19031 +           return;
19032 +       for (page = pgd_list; page; page = (struct page *)page->index) {
19033 +               if (!test_bit(PG_pinned, &page->flags))
19034 +                       __pgd_pin((pgd_t *)page_address(page));
19035 +       }
19036 +}
19037 +
19038 +void _arch_exit_mmap(struct mm_struct *mm)
19039 +{
19040 +       struct task_struct *tsk = current;
19041 +
19042 +       task_lock(tsk);
19043 +
19044 +       /*
19045 +        * We aggressively remove the defunct pgd from cr3. We execute unmap_vmas()
19046 +        * *much* faster this way, as avoiding TLB flushes allows bigger wrpt batches.
19047 +        */
19048 +       if (tsk->active_mm == mm) {
19049 +               tsk->active_mm = &init_mm;
19050 +               atomic_inc(&init_mm.mm_count);
19051 +
19052 +               switch_mm(mm, &init_mm, tsk);
19053 +
19054 +               atomic_dec(&mm->mm_count);
19055 +               BUG_ON(atomic_read(&mm->mm_count) == 0);
19056 +       }
19057 +
19058 +       task_unlock(tsk);
19059 +
19060 +       if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
19061 +           (atomic_read(&mm->mm_count) == 1))
19062 +               mm_unpin(mm);
19063 +}
19064 +
19065 +/*
19066 + * Local variables:
19067 + *  c-file-style: "linux"
19068 + *  indent-tabs-mode: t
19069 + *  c-indent-level: 8
19070 + *  c-basic-offset: 8
19071 + *  tab-width: 8
19072 + * End:
19073 + */
19074 diff -Nurp ref-linux-2.6.16.9/arch/i386/pci/irq-xen.c tmp-linux-2.6-xen.patch/arch/i386/pci/irq-xen.c
19075 --- ref-linux-2.6.16.9/arch/i386/pci/irq-xen.c  1970-01-01 01:00:00.000000000 +0100
19076 +++ tmp-linux-2.6-xen.patch/arch/i386/pci/irq-xen.c     2006-04-10 00:05:52.000000000 +0200
19077 @@ -0,0 +1,1202 @@
19078 +/*
19079 + *     Low-Level PCI Support for PC -- Routing of Interrupts
19080 + *
19081 + *     (c) 1999--2000 Martin Mares <mj@ucw.cz>
19082 + */
19083 +
19084 +#include <linux/config.h>
19085 +#include <linux/types.h>
19086 +#include <linux/kernel.h>
19087 +#include <linux/pci.h>
19088 +#include <linux/init.h>
19089 +#include <linux/slab.h>
19090 +#include <linux/interrupt.h>
19091 +#include <linux/dmi.h>
19092 +#include <asm/io.h>
19093 +#include <asm/smp.h>
19094 +#include <asm/io_apic.h>
19095 +#include <linux/irq.h>
19096 +#include <linux/acpi.h>
19097 +
19098 +#include "pci.h"
19099 +
19100 +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
19101 +#define PIRQ_VERSION 0x0100
19102 +
19103 +static int broken_hp_bios_irq9;
19104 +static int acer_tm360_irqrouting;
19105 +
19106 +static struct irq_routing_table *pirq_table;
19107 +
19108 +static int pirq_enable_irq(struct pci_dev *dev);
19109 +
19110 +/*
19111 + * Never use: 0, 1, 2 (timer, keyboard, and cascade)
19112 + * Avoid using: 13, 14 and 15 (FP error and IDE).
19113 + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
19114 + */
19115 +unsigned int pcibios_irq_mask = 0xfff8;
19116 +
19117 +static int pirq_penalty[16] = {
19118 +       1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
19119 +       0, 0, 0, 0, 1000, 100000, 100000, 100000
19120 +};
19121 +
19122 +struct irq_router {
19123 +       char *name;
19124 +       u16 vendor, device;
19125 +       int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
19126 +       int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
19127 +};
19128 +
19129 +struct irq_router_handler {
19130 +       u16 vendor;
19131 +       int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
19132 +};
19133 +
19134 +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
19135 +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
19136 +
19137 +/*
19138 + *  Check passed address for the PCI IRQ Routing Table signature
19139 + *  and perform checksum verification.
19140 + */
19141 +
19142 +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
19143 +{
19144 +       struct irq_routing_table *rt;
19145 +       int i;
19146 +       u8 sum;
19147 +
19148 +       rt = (struct irq_routing_table *) addr;
19149 +       if (rt->signature != PIRQ_SIGNATURE ||
19150 +           rt->version != PIRQ_VERSION ||
19151 +           rt->size % 16 ||
19152 +           rt->size < sizeof(struct irq_routing_table))
19153 +               return NULL;
19154 +       sum = 0;
19155 +       for (i=0; i < rt->size; i++)
19156 +               sum += addr[i];
19157 +       if (!sum) {
19158 +               DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
19159 +               return rt;
19160 +       }
19161 +       return NULL;
19162 +}
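
The validity test above boils down to: correct signature and version, a size that is a non-zero multiple of 16, and a byte-wise sum of the whole table equal to zero modulo 256. A minimal user-space sketch of the same checksum rule over a fabricated table (the buffer layout and names are illustrative, not the kernel structures):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A valid $PIR table sums to zero (mod 256) over all of its bytes,
 * checksum byte included. */
static int pir_checksum_ok(const uint8_t *tbl, size_t size)
{
        uint8_t sum = 0;
        size_t i;

        for (i = 0; i < size; i++)
                sum += tbl[i];
        return sum == 0;
}

int main(void)
{
        uint8_t tbl[32];
        uint8_t sum = 0;
        size_t i;

        memset(tbl, 0, sizeof(tbl));
        memcpy(tbl, "$PIR", 4);         /* signature */
        tbl[4] = 0x00;                  /* version 0x0100, little-endian */
        tbl[5] = 0x01;
        tbl[6] = 32;                    /* size: non-zero multiple of 16 */
        tbl[7] = 0;

        /* Fix up the final byte so the whole table sums to zero; in a
         * real table the checksum byte sits inside the 32-byte header. */
        for (i = 0; i < sizeof(tbl); i++)
                sum += tbl[i];
        tbl[31] = (uint8_t)(0x100 - sum);

        printf("checksum ok: %d\n", pir_checksum_ok(tbl, sizeof(tbl)));
        return 0;
}
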
19163 +
19164 +
19165 +
19166 +/*
19167 + *  Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
19168 + */
19169 +
19170 +static struct irq_routing_table * __init pirq_find_routing_table(void)
19171 +{
19172 +       u8 *addr;
19173 +       struct irq_routing_table *rt;
19174 +
19175 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
19176 +       if (pirq_table_addr) {
19177 +               rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
19178 +               if (rt)
19179 +                       return rt;
19180 +               printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
19181 +       }
19182 +       for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
19183 +               rt = pirq_check_routing_table(addr);
19184 +               if (rt)
19185 +                       return rt;
19186 +       }
19187 +#endif
19188 +       
19189 +       return NULL;
19190 +}
19191 +
19192 +/*
19193 + *  If we have an IRQ routing table, use it to search for peer host
19194 + *  bridges.  It's a gross hack, but since there is no other known
19195 + *  way to get a list of buses, we have to go this way.
19196 + */
19197 +
19198 +static void __init pirq_peer_trick(void)
19199 +{
19200 +       struct irq_routing_table *rt = pirq_table;
19201 +       u8 busmap[256];
19202 +       int i;
19203 +       struct irq_info *e;
19204 +
19205 +       memset(busmap, 0, sizeof(busmap));
19206 +       for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
19207 +               e = &rt->slots[i];
19208 +#ifdef DEBUG
19209 +               {
19210 +                       int j;
19211 +                       DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
19212 +                       for(j=0; j<4; j++)
19213 +                               DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
19214 +                       DBG("\n");
19215 +               }
19216 +#endif
19217 +               busmap[e->bus] = 1;
19218 +       }
19219 +       for(i = 1; i < 256; i++) {
19220 +               if (!busmap[i] || pci_find_bus(0, i))
19221 +                       continue;
19222 +               if (pci_scan_bus(i, &pci_root_ops, NULL))
19223 +                       printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
19224 +       }
19225 +       pcibios_last_bus = -1;
19226 +}
19227 +
19228 +/*
19229 + *  Code for querying and setting of IRQ routes on various interrupt routers.
19230 + */
19231 +
19232 +void eisa_set_level_irq(unsigned int irq)
19233 +{
19234 +       unsigned char mask = 1 << (irq & 7);
19235 +       unsigned int port = 0x4d0 + (irq >> 3);
19236 +       unsigned char val;
19237 +       static u16 eisa_irq_mask;
19238 +
19239 +       if (irq >= 16 || (1 << irq) & eisa_irq_mask)
19240 +               return;
19241 +
19242 +       eisa_irq_mask |= (1 << irq);
19243 +       printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
19244 +       val = inb(port);
19245 +       if (!(val & mask)) {
19246 +               DBG(KERN_DEBUG " -> edge");
19247 +               outb(val | mask, port);
19248 +       }
19249 +}
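
The two ELCR registers at I/O ports 0x4d0 and 0x4d1 carry one trigger-mode bit per ISA IRQ (0 = edge, 1 = level), which is where the irq >> 3 port offset and the 1 << (irq & 7) mask above come from. The same bit arithmetic in a stand-alone sketch, with a plain array standing in for the two ports (illustrative names only):

#include <stdio.h>

/* Model of the two EISA Edge/Level Control Registers (0x4d0/0x4d1). */
static unsigned char elcr[2];

static void set_level_triggered(unsigned int irq)
{
        if (irq >= 16)
                return;                         /* only ISA IRQs 0..15 */
        elcr[irq >> 3] |= 1u << (irq & 7);      /* 1 = level-triggered */
}

int main(void)
{
        set_level_triggered(11);
        printf("ELCR: %02x %02x\n", elcr[0], elcr[1]);  /* -> 00 08 */
        return 0;
}
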
19250 +
19251 +/*
19252 + * Common IRQ routing practice: nybbles in config space,
19253 + * offset by some magic constant.
19254 + */
19255 +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
19256 +{
19257 +       u8 x;
19258 +       unsigned reg = offset + (nr >> 1);
19259 +
19260 +       pci_read_config_byte(router, reg, &x);
19261 +       return (nr & 1) ? (x >> 4) : (x & 0xf);
19262 +}
19263 +
19264 +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
19265 +{
19266 +       u8 x;
19267 +       unsigned reg = offset + (nr >> 1);
19268 +
19269 +       pci_read_config_byte(router, reg, &x);
19270 +       x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
19271 +       pci_write_config_byte(router, reg, x);
19272 +}
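
These two helpers implement the packing rule used by most of the routers below: each config-space byte holds two 4-bit link values, with entry nr landing in the low nibble of byte offset + (nr >> 1) when nr is even and in the high nibble when it is odd. A self-contained model over an ordinary byte array (hypothetical names; no real PCI access):

#include <stdio.h>

static unsigned char cfg[256];          /* stand-in for PCI config space */

static unsigned int read_nybble(unsigned offset, unsigned nr)
{
        unsigned char x = cfg[offset + (nr >> 1)];

        return (nr & 1) ? (x >> 4) : (x & 0xf);
}

static void write_nybble(unsigned offset, unsigned nr, unsigned val)
{
        unsigned char *x = &cfg[offset + (nr >> 1)];

        *x = (nr & 1) ? ((*x & 0x0f) | (val << 4)) : ((*x & 0xf0) | val);
}

int main(void)
{
        write_nybble(0x55, 0, 0x5);     /* low nibble of byte 0x55 */
        write_nybble(0x55, 1, 0xa);     /* high nibble of byte 0x55 */
        printf("%02x -> %x %x\n", cfg[0x55],
               read_nybble(0x55, 0), read_nybble(0x55, 1)); /* a5 -> 5 a */
        return 0;
}
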
19273 +
19274 +/*
19275 + * ALI pirq entries are damn ugly, and completely undocumented.
19276 + * This has been figured out from pirq tables, and it's not a pretty
19277 + * picture.
19278 + */
19279 +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19280 +{
19281 +       static unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
19282 +
19283 +       return irqmap[read_config_nybble(router, 0x48, pirq-1)];
19284 +}
19285 +
19286 +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19287 +{
19288 +       static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
19289 +       unsigned int val = irqmap[irq];
19290 +               
19291 +       if (val) {
19292 +               write_config_nybble(router, 0x48, pirq-1, val);
19293 +               return 1;
19294 +       }
19295 +       return 0;
19296 +}
19297 +
19298 +/*
19299 + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
19300 + * just a pointer to the config space.
19301 + */
19302 +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19303 +{
19304 +       u8 x;
19305 +
19306 +       pci_read_config_byte(router, pirq, &x);
19307 +       return (x < 16) ? x : 0;
19308 +}
19309 +
19310 +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19311 +{
19312 +       pci_write_config_byte(router, pirq, irq);
19313 +       return 1;
19314 +}
19315 +
19316 +/*
19317 + * The VIA pirq rules are nibble-based, like ALI,
19318 + * but without the ugly irq number munging.
19319 + * However, PIRQD is in the upper instead of lower 4 bits.
19320 + */
19321 +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19322 +{
19323 +       return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
19324 +}
19325 +
19326 +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19327 +{
19328 +       write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
19329 +       return 1;
19330 +}
19331 +
19332 +/*
19333 + * The VIA pirq rules are nibble-based, like ALI,
19334 + * but without the ugly irq number munging.
19335 + * However, for the 82C586, the nibble map is different.
19336 + */
19337 +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19338 +{
19339 +       static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
19340 +       return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
19341 +}
19342 +
19343 +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19344 +{
19345 +       static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
19346 +       write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
19347 +       return 1;
19348 +}
19349 +
19350 +/*
19351 + * ITE 8330G pirq rules are nibble-based
19352 + * FIXME: pirqmap may be { 1, 0, 3, 2 },
19353 + *       2+3 are both mapped to irq 9 on my system
19354 + */
19355 +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19356 +{
19357 +       static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
19358 +       return read_config_nybble(router,0x43, pirqmap[pirq-1]);
19359 +}
19360 +
19361 +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19362 +{
19363 +       static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
19364 +       write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
19365 +       return 1;
19366 +}
19367 +
19368 +/*
19369 + * OPTI: high four bits are nibble pointer..
19370 + * I wonder what the low bits do?
19371 + */
19372 +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19373 +{
19374 +       return read_config_nybble(router, 0xb8, pirq >> 4);
19375 +}
19376 +
19377 +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19378 +{
19379 +       write_config_nybble(router, 0xb8, pirq >> 4, irq);
19380 +       return 1;
19381 +}
19382 +
19383 +/*
19384 + * Cyrix: nibble offset 0x5C
19385 + * 0x5C: bits 7:4 are INTB, bits 3:0 are INTA
19386 + * 0x5D: bits 7:4 are INTD, bits 3:0 are INTC
19387 + */
19388 +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19389 +{
19390 +       return read_config_nybble(router, 0x5C, (pirq-1)^1);
19391 +}
19392 +
19393 +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19394 +{
19395 +       write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
19396 +       return 1;
19397 +}
19398 +
19399 +/*
19400 + *     PIRQ routing for SiS 85C503 router used in several SiS chipsets.
19401 + *     We have to deal with the following issues here:
19402 + *     - vendors have different ideas about the meaning of link values
19403 + *     - some onboard devices (integrated in the chipset) have special
19404 + *       links and are thus routed differently (i.e. not via PCI INTA-INTD)
19405 + *     - different revisions of the router have different layouts for
19406 + *       the routing registers, particularly for the onchip devices
19407 + *
19408 + *     Common to all routing registers: we have one byte per
19409 + *     routable link, defined as:
19410 + *              bit 7      IRQ mapping enabled (0) or disabled (1)
19411 + *              bits [6:4] reserved (sometimes used for onchip devices)
19412 + *              bits [3:0] IRQ to map to
19413 + *                  allowed: 3-7, 9-12, 14-15
19414 + *                  reserved: 0, 1, 2, 8, 13
19415 + *
19416 + *     The config-space registers located at 0x41/0x42/0x43/0x44 are
19417 + *     always used to route the normal PCI INT A/B/C/D respectively.
19418 + *     Apparently there are systems implementing the PCI routing table using
19419 + *     link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
19420 + *     We try our best to handle both link mappings.
19421 + *     
19422 + *     Currently (2003-05-21) it appears most SiS chipsets follow the
19423 + *     definition of routing registers from the SiS-5595 southbridge.
19424 + *     According to the SiS 5595 datasheets, the revision IDs of the
19425 + *     router (ISA-bridge) should be 0x01 or 0xb0.
19426 + *
19427 + *     Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
19428 + *     Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
19429 + *     They seem to work with the current routing code. However there is
19430 + *     some concern because of the two USB-OHCI HCs (original SiS 5595
19431 + *     had only one). YMMV.
19432 + *
19433 + *     Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
19434 + *
19435 + *     0x61:   IDEIRQ:
19436 + *             bits [6:5] must be written 01
19437 + *             bit 4 channel-select primary (0), secondary (1)
19438 + *
19439 + *     0x62:   USBIRQ:
19440 + *             bit 6 OHCI function disabled (0), enabled (1)
19441 + *     
19442 + *     0x6a:   ACPI/SCI IRQ: bits 4-6 reserved
19443 + *
19444 + *     0x7e:   Data Acq. Module IRQ - bits 4-6 reserved
19445 + *
19446 + *     We support USBIRQ (in addition to INTA-INTD) and keep the
19447 + *     IDE, ACPI and DAQ routing untouched as set by the BIOS.
19448 + *
19449 + *     Currently the only reported exception is the new SiS 65x chipset
19450 + *     which includes the SiS 69x southbridge. Here we have the 85C503
19451 + *     router revision 0x04 and there are changes in the register layout
19452 + *     mostly related to the different USB HCs with USB 2.0 support.
19453 + *
19454 + *     Onchip routing for router rev-id 0x04 (trial-and-error observation)
19455 + *
19456 + *     0x60/0x61/0x62/0x63:    1xEHCI and 3xOHCI (companion) USB-HCs
19457 + *                             bits [6:4] are probably unused, unlike the 5595
19458 + */
19459 +
19460 +#define PIRQ_SIS_IRQ_MASK      0x0f
19461 +#define PIRQ_SIS_IRQ_DISABLE   0x80
19462 +#define PIRQ_SIS_USB_ENABLE    0x40
19463 +
19464 +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19465 +{
19466 +       u8 x;
19467 +       int reg;
19468 +
19469 +       reg = pirq;
19470 +       if (reg >= 0x01 && reg <= 0x04)
19471 +               reg += 0x40;
19472 +       pci_read_config_byte(router, reg, &x);
19473 +       return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
19474 +}
19475 +
19476 +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19477 +{
19478 +       u8 x;
19479 +       int reg;
19480 +
19481 +       reg = pirq;
19482 +       if (reg >= 0x01 && reg <= 0x04)
19483 +               reg += 0x40;
19484 +       pci_read_config_byte(router, reg, &x);
19485 +       x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
19486 +       x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
19487 +       pci_write_config_byte(router, reg, x);
19488 +       return 1;
19489 +}
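
Both SiS accessors above normalize the two link conventions the comment block describes: link values 0x01-0x04 are rebased onto the INTA-INTD registers at 0x41-0x44, any other value (0x41-0x44 themselves, or an on-chip link such as 0x62 for USBIRQ) is used as a register offset directly, and bit 7 of the register then decides whether the low-nibble IRQ is live. The mapping as two pure functions (a sketch with illustrative names, not the kernel code):

#include <stdio.h>

#define SIS_IRQ_MASK    0x0f    /* bits [3:0]: IRQ to map to */
#define SIS_IRQ_DISABLE 0x80    /* bit 7: routing disabled when set */

/* Translate a PIRQ-table link value to a SiS router register offset. */
static int sis_link_to_reg(int link)
{
        return (link >= 0x01 && link <= 0x04) ? link + 0x40 : link;
}

/* Decode one routing-register byte into an IRQ number (0 = disabled). */
static int sis_decode(unsigned char reg_val)
{
        return (reg_val & SIS_IRQ_DISABLE) ? 0 : (reg_val & SIS_IRQ_MASK);
}

int main(void)
{
        printf("link 0x02 -> reg 0x%02x\n", sis_link_to_reg(0x02)); /* 0x42 */
        printf("reg byte 0x0b -> irq %d\n", sis_decode(0x0b));      /* 11 */
        printf("reg byte 0x8b -> irq %d\n", sis_decode(0x8b));      /* 0 */
        return 0;
}
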
19490 +
19491 +
19492 +/*
19493 + * VLSI: nibble offset 0x74 - educated guess due to routing table and
19494 + *       config space of VLSI 82C534 PCI-bridge/router (1004:0102)
19495 + *       Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
19496 + *       devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
19497 + *       for the busbridge to the docking station.
19498 + */
19499 +
19500 +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19501 +{
19502 +       if (pirq > 8) {
19503 +               printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19504 +               return 0;
19505 +       }
19506 +       return read_config_nybble(router, 0x74, pirq-1);
19507 +}
19508 +
19509 +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19510 +{
19511 +       if (pirq > 8) {
19512 +               printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19513 +               return 0;
19514 +       }
19515 +       write_config_nybble(router, 0x74, pirq-1, irq);
19516 +       return 1;
19517 +}
19518 +
19519 +/*
19520 + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
19521 + * and Redirect I/O registers (0x0c00 and 0x0c01).  The Index register
19522 + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a.  The Redirect
19523 + * register is a straight binary coding of desired PIC IRQ (low nibble).
19524 + *
19525 + * The 'link' value in the PIRQ table is already in the correct format
19526 + * for the Index register.  There are some special index values:
19527 + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
19528 + * and 0x03 for SMBus.
19529 + */
19530 +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19531 +{
19532 +       outb_p(pirq, 0xc00);
19533 +       return inb(0xc01) & 0xf;
19534 +}
19535 +
19536 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19537 +{
19538 +       outb_p(pirq, 0xc00);
19539 +       outb_p(irq, 0xc01);
19540 +       return 1;
19541 +}
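
The ServerWorks router is a textbook index/data pair: the link value goes to the Index port (0xc00), and the PIC IRQ is then read or written through the Redirect port (0xc01). A toy model of that access pattern, with arrays standing in for the two ports (everything here is illustrative):

#include <stdio.h>

/* Simulated Index/Redirect register file: one 4-bit IRQ per index. */
static unsigned char redirect[256];
static unsigned char index_reg;

static void port_write(unsigned short port, unsigned char val)
{
        if (port == 0xc00)
                index_reg = val;                 /* select the link */
        else if (port == 0xc01)
                redirect[index_reg] = val & 0xf; /* program the PIC IRQ */
}

static unsigned char port_read(unsigned short port)
{
        return (port == 0xc01) ? (redirect[index_reg] & 0xf) : index_reg;
}

int main(void)
{
        port_write(0xc00, 0x1a);        /* PCIIRQ10: (10 | 0x10) = 0x1a */
        port_write(0xc01, 11);          /* route it to PIC IRQ 11 */
        printf("irq = %d\n", port_read(0xc01)); /* -> 11 */
        return 0;
}
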
19542 +
19543 +/* Support for AMD756 PCI IRQ Routing
19544 + * Jhon H. Caicedo <jhcaiced@osso.org.co>
19545 + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
19546 + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
19547 + * The AMD756 pirq rules are nibble-based
19548 + * offset 0x56 0-3 PIRQA  4-7  PIRQB
19549 + * offset 0x57 0-3 PIRQC  4-7  PIRQD
19550 + */
19551 +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19552 +{
19553 +       u8 irq;
19554 +       irq = 0;
19555 +       if (pirq <= 4)
19556 +       {
19557 +               irq = read_config_nybble(router, 0x56, pirq - 1);
19558 +       }
19559 +       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
19560 +               dev->vendor, dev->device, pirq, irq);
19561 +       return irq;
19562 +}
19563 +
19564 +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19565 +{
19566 +       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 
19567 +               dev->vendor, dev->device, pirq, irq);
19568 +       if (pirq <= 4)
19569 +       {
19570 +               write_config_nybble(router, 0x56, pirq - 1, irq);
19571 +       }
19572 +       return 1;
19573 +}
19574 +
19575 +#ifdef CONFIG_PCI_BIOS
19576 +
19577 +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19578 +{
19579 +       struct pci_dev *bridge;
19580 +       int pin = pci_get_interrupt_pin(dev, &bridge);
19581 +       return pcibios_set_irq_routing(bridge, pin, irq);
19582 +}
19583 +
19584 +#endif
19585 +
19586 +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19587 +{
19588 +       static struct pci_device_id pirq_440gx[] = {
19589 +               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
19590 +               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
19591 +               { },
19592 +       };
19593 +
19594 +       /* 440GX has a proprietary PIRQ router -- don't use it */
19595 +       if (pci_dev_present(pirq_440gx))
19596 +               return 0;
19597 +
19598 +       switch(device)
19599 +       {
19600 +               case PCI_DEVICE_ID_INTEL_82371FB_0:
19601 +               case PCI_DEVICE_ID_INTEL_82371SB_0:
19602 +               case PCI_DEVICE_ID_INTEL_82371AB_0:
19603 +               case PCI_DEVICE_ID_INTEL_82371MX:
19604 +               case PCI_DEVICE_ID_INTEL_82443MX_0:
19605 +               case PCI_DEVICE_ID_INTEL_82801AA_0:
19606 +               case PCI_DEVICE_ID_INTEL_82801AB_0:
19607 +               case PCI_DEVICE_ID_INTEL_82801BA_0:
19608 +               case PCI_DEVICE_ID_INTEL_82801BA_10:
19609 +               case PCI_DEVICE_ID_INTEL_82801CA_0:
19610 +               case PCI_DEVICE_ID_INTEL_82801CA_12:
19611 +               case PCI_DEVICE_ID_INTEL_82801DB_0:
19612 +               case PCI_DEVICE_ID_INTEL_82801E_0:
19613 +               case PCI_DEVICE_ID_INTEL_82801EB_0:
19614 +               case PCI_DEVICE_ID_INTEL_ESB_1:
19615 +               case PCI_DEVICE_ID_INTEL_ICH6_0:
19616 +               case PCI_DEVICE_ID_INTEL_ICH6_1:
19617 +               case PCI_DEVICE_ID_INTEL_ICH7_0:
19618 +               case PCI_DEVICE_ID_INTEL_ICH7_1:
19619 +               case PCI_DEVICE_ID_INTEL_ICH7_30:
19620 +               case PCI_DEVICE_ID_INTEL_ICH7_31:
19621 +               case PCI_DEVICE_ID_INTEL_ESB2_0:
19622 +               case PCI_DEVICE_ID_INTEL_ICH8_0:
19623 +               case PCI_DEVICE_ID_INTEL_ICH8_1:
19624 +               case PCI_DEVICE_ID_INTEL_ICH8_2:
19625 +               case PCI_DEVICE_ID_INTEL_ICH8_3:
19626 +               case PCI_DEVICE_ID_INTEL_ICH8_4:
19627 +                       r->name = "PIIX/ICH";
19628 +                       r->get = pirq_piix_get;
19629 +                       r->set = pirq_piix_set;
19630 +                       return 1;
19631 +       }
19632 +       return 0;
19633 +}
19634 +
19635 +static __init int via_router_probe(struct irq_router *r,
19636 +                               struct pci_dev *router, u16 device)
19637 +{
19638 +       /* FIXME: We should move some of the quirk fixup stuff here */
19639 +
19640 +       /*
19641 +        * workarounds for some buggy BIOSes
19642 +        */
19643 +       if (device == PCI_DEVICE_ID_VIA_82C586_0) {
19644 +               switch(router->device) {
19645 +               case PCI_DEVICE_ID_VIA_82C686:
19646 +                       /*
19647 +                        * Asus k7m bios wrongly reports 82C686A
19648 +                        * as 586-compatible
19649 +                        */
19650 +                       device = PCI_DEVICE_ID_VIA_82C686;
19651 +                       break;
19652 +               case PCI_DEVICE_ID_VIA_8235:
19653 +                       /*
19654 +                        * Asus a7v-x bios wrongly reports 8235
19655 +                        * as 586-compatible
19656 +                        */
19657 +                       device = PCI_DEVICE_ID_VIA_8235;
19658 +                       break;
19659 +               }
19660 +       }
19661 +
19662 +       switch(device) {
19663 +       case PCI_DEVICE_ID_VIA_82C586_0:
19664 +               r->name = "VIA";
19665 +               r->get = pirq_via586_get;
19666 +               r->set = pirq_via586_set;
19667 +               return 1;
19668 +       case PCI_DEVICE_ID_VIA_82C596:
19669 +       case PCI_DEVICE_ID_VIA_82C686:
19670 +       case PCI_DEVICE_ID_VIA_8231:
19671 +       case PCI_DEVICE_ID_VIA_8235:
19672 +               /* FIXME: add new ones for 8233/5 */
19673 +               r->name = "VIA";
19674 +               r->get = pirq_via_get;
19675 +               r->set = pirq_via_set;
19676 +               return 1;
19677 +       }
19678 +       return 0;
19679 +}
19680 +
19681 +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19682 +{
19683 +       switch(device)
19684 +       {
19685 +               case PCI_DEVICE_ID_VLSI_82C534:
19686 +                       r->name = "VLSI 82C534";
19687 +                       r->get = pirq_vlsi_get;
19688 +                       r->set = pirq_vlsi_set;
19689 +                       return 1;
19690 +       }
19691 +       return 0;
19692 +}
19693 +
19694 +
19695 +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19696 +{
19697 +       switch(device)
19698 +       {
19699 +               case PCI_DEVICE_ID_SERVERWORKS_OSB4:
19700 +               case PCI_DEVICE_ID_SERVERWORKS_CSB5:
19701 +                       r->name = "ServerWorks";
19702 +                       r->get = pirq_serverworks_get;
19703 +                       r->set = pirq_serverworks_set;
19704 +                       return 1;
19705 +       }
19706 +       return 0;
19707 +}
19708 +
19709 +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19710 +{
19711 +       if (device != PCI_DEVICE_ID_SI_503)
19712 +               return 0;
19713 +               
19714 +       r->name = "SIS";
19715 +       r->get = pirq_sis_get;
19716 +       r->set = pirq_sis_set;
19717 +       return 1;
19718 +}
19719 +
19720 +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19721 +{
19722 +       switch(device)
19723 +       {
19724 +               case PCI_DEVICE_ID_CYRIX_5520:
19725 +                       r->name = "NatSemi";
19726 +                       r->get = pirq_cyrix_get;
19727 +                       r->set = pirq_cyrix_set;
19728 +                       return 1;
19729 +       }
19730 +       return 0;
19731 +}
19732 +
19733 +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19734 +{
19735 +       switch(device)
19736 +       {
19737 +               case PCI_DEVICE_ID_OPTI_82C700:
19738 +                       r->name = "OPTI";
19739 +                       r->get = pirq_opti_get;
19740 +                       r->set = pirq_opti_set;
19741 +                       return 1;
19742 +       }
19743 +       return 0;
19744 +}
19745 +
19746 +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19747 +{
19748 +       switch(device)
19749 +       {
19750 +               case PCI_DEVICE_ID_ITE_IT8330G_0:
19751 +                       r->name = "ITE";
19752 +                       r->get = pirq_ite_get;
19753 +                       r->set = pirq_ite_set;
19754 +                       return 1;
19755 +       }
19756 +       return 0;
19757 +}
19758 +
19759 +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19760 +{
19761 +       switch(device)
19762 +       {
19763 +       case PCI_DEVICE_ID_AL_M1533:
19764 +       case PCI_DEVICE_ID_AL_M1563:
19765 +               printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
19766 +               r->name = "ALI";
19767 +               r->get = pirq_ali_get;
19768 +               r->set = pirq_ali_set;
19769 +               return 1;
19770 +       }
19771 +       return 0;
19772 +}
19773 +
19774 +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19775 +{
19776 +       switch(device)
19777 +       {
19778 +               case PCI_DEVICE_ID_AMD_VIPER_740B:
19779 +                       r->name = "AMD756";
19780 +                       break;
19781 +               case PCI_DEVICE_ID_AMD_VIPER_7413:
19782 +                       r->name = "AMD766";
19783 +                       break;
19784 +               case PCI_DEVICE_ID_AMD_VIPER_7443:
19785 +                       r->name = "AMD768";
19786 +                       break;
19787 +               default:
19788 +                       return 0;
19789 +       }
19790 +       r->get = pirq_amd756_get;
19791 +       r->set = pirq_amd756_set;
19792 +       return 1;
19793 +}
19794 +               
19795 +static __initdata struct irq_router_handler pirq_routers[] = {
19796 +       { PCI_VENDOR_ID_INTEL, intel_router_probe },
19797 +       { PCI_VENDOR_ID_AL, ali_router_probe },
19798 +       { PCI_VENDOR_ID_ITE, ite_router_probe },
19799 +       { PCI_VENDOR_ID_VIA, via_router_probe },
19800 +       { PCI_VENDOR_ID_OPTI, opti_router_probe },
19801 +       { PCI_VENDOR_ID_SI, sis_router_probe },
19802 +       { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
19803 +       { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
19804 +       { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
19805 +       { PCI_VENDOR_ID_AMD, amd_router_probe },
19806 +       /* Someone with docs needs to add the ATI Radeon IGP */
19807 +       { 0, NULL }
19808 +};
19809 +static struct irq_router pirq_router;
19810 +static struct pci_dev *pirq_router_dev;
19811 +
19812 +
19813 +/*
19814 + *     FIXME: should we have an option to say "generic for
19815 + *     chipset" ?
19816 + *     chipset"?
19817 +
19818 +static void __init pirq_find_router(struct irq_router *r)
19819 +{
19820 +       struct irq_routing_table *rt = pirq_table;
19821 +       struct irq_router_handler *h;
19822 +
19823 +#ifdef CONFIG_PCI_BIOS
19824 +       if (!rt->signature) {
19825 +               printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
19826 +               r->set = pirq_bios_set;
19827 +               r->name = "BIOS";
19828 +               return;
19829 +       }
19830 +#endif
19831 +
19832 +       /* Default unless a driver reloads it */
19833 +       r->name = "default";
19834 +       r->get = NULL;
19835 +       r->set = NULL;
19836 +       
19837 +       DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
19838 +           rt->rtr_vendor, rt->rtr_device);
19839 +
19840 +       pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
19841 +       if (!pirq_router_dev) {
19842 +               DBG(KERN_DEBUG "PCI: Interrupt router not found at "
19843 +                       "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
19844 +               return;
19845 +       }
19846 +
19847 +       for( h = pirq_routers; h->vendor; h++) {
19848 +               /* First look for a router match */
19849 +               if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
19850 +                       break;
19851 +               /* Fall back to a device match */
19852 +               if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
19853 +                       break;
19854 +       }
19855 +       printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
19856 +               pirq_router.name,
19857 +               pirq_router_dev->vendor,
19858 +               pirq_router_dev->device,
19859 +               pci_name(pirq_router_dev));
19860 +}
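
pirq_find_router() drives the pirq_routers[] table above as a probe chain: walk the (vendor, probe) pairs, first matching on the routing table's vendor and then falling back to the router device's own vendor, and stop at the first probe that claims the router; the zero-vendor entry is the sentinel. A compact stand-alone version of that dispatch pattern (the vendor/device IDs and names are illustrative):

#include <stdio.h>

struct router {
        const char *name;
};

/* One probe per vendor; a probe "claims" the router by filling it in
 * and returning non-zero - the same contract as the table above. */
struct handler {
        unsigned short vendor;
        int (*probe)(struct router *r, unsigned short device);
};

static int intel_probe(struct router *r, unsigned short device)
{
        if (device != 0x7000)           /* e.g. a PIIX3 ISA bridge */
                return 0;
        r->name = "PIIX/ICH";
        return 1;
}

static const struct handler handlers[] = {
        { 0x8086, intel_probe },
        { 0, NULL }                     /* zero vendor = end of chain */
};

static const char *find_router(unsigned short vendor, unsigned short device)
{
        struct router r = { "default" };        /* unless a probe claims it */
        const struct handler *h;

        for (h = handlers; h->vendor; h++)
                if (h->vendor == vendor && h->probe(&r, device))
                        break;                  /* first claimant wins */
        return r.name;
}

int main(void)
{
        printf("%s\n", find_router(0x8086, 0x7000));    /* PIIX/ICH */
        printf("%s\n", find_router(0x1106, 0x0586));    /* default */
        return 0;
}
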
19861 +
19862 +static struct irq_info *pirq_get_info(struct pci_dev *dev)
19863 +{
19864 +       struct irq_routing_table *rt = pirq_table;
19865 +       int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
19866 +       struct irq_info *info;
19867 +
19868 +       for (info = rt->slots; entries--; info++)
19869 +               if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
19870 +                       return info;
19871 +       return NULL;
19872 +}
19873 +
19874 +static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
19875 +{
19876 +       u8 pin;
19877 +       struct irq_info *info;
19878 +       int i, pirq, newirq;
19879 +       int irq = 0;
19880 +       u32 mask;
19881 +       struct irq_router *r = &pirq_router;
19882 +       struct pci_dev *dev2 = NULL;
19883 +       char *msg = NULL;
19884 +
19885 +       /* Find IRQ pin */
19886 +       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19887 +       if (!pin) {
19888 +               DBG(KERN_DEBUG " -> no interrupt pin\n");
19889 +               return 0;
19890 +       }
19891 +       pin = pin - 1;
19892 +
19893 +       /* Find IRQ routing entry */
19894 +
19895 +       if (!pirq_table)
19896 +               return 0;
19897 +       
19898 +       DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
19899 +       info = pirq_get_info(dev);
19900 +       if (!info) {
19901 +               DBG(" -> not found in routing table\n" KERN_DEBUG);
19902 +               return 0;
19903 +       }
19904 +       pirq = info->irq[pin].link;
19905 +       mask = info->irq[pin].bitmap;
19906 +       if (!pirq) {
19907 +               DBG(" -> not routed\n" KERN_DEBUG);
19908 +               return 0;
19909 +       }
19910 +       DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
19911 +       mask &= pcibios_irq_mask;
19912 +
19913 +       /* Work around broken HP Pavilion Notebooks which assign USB to
19914 +          IRQ 9 even though it is actually wired to IRQ 11 */
19915 +
19916 +       if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
19917 +               dev->irq = 11;
19918 +               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
19919 +               r->set(pirq_router_dev, dev, pirq, 11);
19920 +       }
19921 +
19922 +       /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
19923 +       if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
19924 +               pirq = 0x68;
19925 +               mask = 0x400;
19926 +               dev->irq = r->get(pirq_router_dev, dev, pirq);
19927 +               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
19928 +       }
19929 +
19930 +       /*
19931 +        * Find the best IRQ to assign: use the one
19932 +        * reported by the device if possible.
19933 +        */
19934 +       newirq = dev->irq;
19935 +       if (newirq && !((1 << newirq) & mask)) {
19936 +               if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
19937 +               else printk("\n" KERN_WARNING
19938 +                       "PCI: IRQ %i for device %s doesn't match PIRQ mask "
19939 +                       "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
19940 +                       pci_name(dev));
19941 +       }
19942 +       if (!newirq && assign) {
19943 +               for (i = 0; i < 16; i++) {
19944 +                       if (!(mask & (1 << i)))
19945 +                               continue;
19946 +                       if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ))
19947 +                               newirq = i;
19948 +               }
19949 +       }
19950 +       DBG(" -> newirq=%d", newirq);
19951 +
19952 +       /* Check if it is hardcoded */
19953 +       if ((pirq & 0xf0) == 0xf0) {
19954 +               irq = pirq & 0xf;
19955 +               DBG(" -> hardcoded IRQ %d\n", irq);
19956 +               msg = "Hardcoded";
19957 +       } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
19958 +       ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
19959 +               DBG(" -> got IRQ %d\n", irq);
19960 +               msg = "Found";
19961 +       } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
19962 +               DBG(" -> assigning IRQ %d", newirq);
19963 +               if (r->set(pirq_router_dev, dev, pirq, newirq)) {
19964 +                       eisa_set_level_irq(newirq);
19965 +                       DBG(" ... OK\n");
19966 +                       msg = "Assigned";
19967 +                       irq = newirq;
19968 +               }
19969 +       }
19970 +
19971 +       if (!irq) {
19972 +               DBG(" ... failed\n");
19973 +               if (newirq && mask == (1 << newirq)) {
19974 +                       msg = "Guessed";
19975 +                       irq = newirq;
19976 +               } else
19977 +                       return 0;
19978 +       }
19979 +       printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
19980 +
19981 +       /* Update IRQ for all devices with the same pirq value */
19982 +       while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
19983 +               pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
19984 +               if (!pin)
19985 +                       continue;
19986 +               pin--;
19987 +               info = pirq_get_info(dev2);
19988 +               if (!info)
19989 +                       continue;
19990 +               if (info->irq[pin].link == pirq) {
19991 +                       /* We refuse to override the dev->irq information. Give a warning! */
19992 +                       if ( dev2->irq && dev2->irq != irq && \
19993 +                       (!(pci_probe & PCI_USE_PIRQ_MASK) || \
19994 +                       ((1 << dev2->irq) & mask)) ) {
19995 +#ifndef CONFIG_PCI_MSI
19996 +                               printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
19997 +                                      pci_name(dev2), dev2->irq, irq);
19998 +#endif
19999 +                               continue;
20000 +                       }
20001 +                       dev2->irq = irq;
20002 +                       pirq_penalty[irq]++;
20003 +                       if (dev != dev2)
20004 +                               printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
20005 +               }
20006 +       }
20007 +       return 1;
20008 +}
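
When the device reports no usable IRQ and assignment is requested, the loop near the top of pcibios_lookup_irq() picks the candidate from the PIRQ bitmap with the lowest penalty score (legacy ISA users start with high penalties, shareable PCI lines near zero). A minimal stand-alone version of just that selection step, omitting the can_request_irq() check (the mask and table values are illustrative):

#include <stdio.h>

/* Startup penalties: never 0-2, avoid 13-15, penalize known ISA users. */
static int penalty[16] = {
        1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
        0, 0, 0, 0, 1000, 100000, 100000, 100000
};

/* Pick the least-penalized IRQ allowed by the PIRQ entry's bitmap.
 * Starting at 0 works because penalty[0] is prohibitive, so any
 * allowed IRQ with a lower score displaces it. */
static int pick_irq(unsigned int mask)
{
        int i, best = 0;

        for (i = 0; i < 16; i++)
                if ((mask & (1u << i)) && penalty[i] < penalty[best])
                        best = i;
        return best;            /* 0 means nothing assignable */
}

int main(void)
{
        /* Bitmap allowing IRQs 3, 4, 10, 11: 10 and 11 carry no penalty. */
        printf("chosen irq = %d\n", pick_irq(0x0c18));  /* -> 10 */
        return 0;
}
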
20009 +
20010 +static void __init pcibios_fixup_irqs(void)
20011 +{
20012 +       struct pci_dev *dev = NULL;
20013 +       u8 pin;
20014 +
20015 +       DBG(KERN_DEBUG "PCI: IRQ fixup\n");
20016 +       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
20017 +               /*
20018 +                * If the BIOS has set an out-of-range IRQ number, just ignore it.
20019 +                * Also keep track of which IRQs are already in use.
20020 +                */
20021 +               if (dev->irq >= 16) {
20022 +                       DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
20023 +                       dev->irq = 0;
20024 +               }
20025 +               /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
20026 +               if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
20027 +                       pirq_penalty[dev->irq] = 0;
20028 +               pirq_penalty[dev->irq]++;
20029 +       }
20030 +
20031 +       dev = NULL;
20032 +       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
20033 +               pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
20034 +#ifdef CONFIG_X86_IO_APIC
20035 +               /*
20036 +                * Recalculate IRQ numbers if we use the I/O APIC.
20037 +                */
20038 +               if (io_apic_assign_pci_irqs)
20039 +               {
20040 +                       int irq;
20041 +
20042 +                       if (pin) {
20043 +                               pin--;          /* interrupt pins are numbered starting from 1 */
20044 +                               irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
20045 +       /*
20046 +        * Busses behind bridges are typically not listed in the MP-table.
20047 +        * In this case we have to look up the IRQ based on the parent bus,
20048 +        * parent slot, and pin number. The SMP code detects such bridged
20049 +        * busses itself so we should get into this branch reliably.
20050 +        */
20051 +                               if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
20052 +                                       struct pci_dev * bridge = dev->bus->self;
20053 +
20054 +                                       pin = (pin + PCI_SLOT(dev->devfn)) % 4;
20055 +                                       irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
20056 +                                                       PCI_SLOT(bridge->devfn), pin);
20057 +                                       if (irq >= 0)
20058 +                                               printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
20059 +                                                       pci_name(bridge), 'A' + pin, irq);
20060 +                               }
20061 +                               if (irq >= 0) {
20062 +                                       if (use_pci_vector() &&
20063 +                                               !platform_legacy_irq(irq))
20064 +                                               irq = IO_APIC_VECTOR(irq);
20065 +
20066 +                                       printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
20067 +                                               pci_name(dev), 'A' + pin, irq);
20068 +                                       dev->irq = irq;
20069 +                               }
20070 +                       }
20071 +               }
20072 +#endif
20073 +               /*
20074 +                * Still no IRQ? Try to lookup one...
20075 +                */
20076 +               if (pin && !dev->irq)
20077 +                       pcibios_lookup_irq(dev, 0);
20078 +       }
20079 +}
20080 +
20081 +/*
20082 + * Work around broken HP Pavilion Notebooks which assign USB to
20083 + * IRQ 9 even though it is actually wired to IRQ 11
20084 + */
20085 +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
20086 +{
20087 +       if (!broken_hp_bios_irq9) {
20088 +               broken_hp_bios_irq9 = 1;
20089 +               printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
20090 +       }
20091 +       return 0;
20092 +}
20093 +
20094 +/*
20095 + * Work around broken Acer TravelMate 360 Notebooks which assign
20096 + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
20097 + */
20098 +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
20099 +{
20100 +       if (!acer_tm360_irqrouting) {
20101 +               acer_tm360_irqrouting = 1;
20102 +               printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
20103 +       }
20104 +       return 0;
20105 +}
20106 +
20107 +static struct dmi_system_id __initdata pciirq_dmi_table[] = {
20108 +       {
20109 +               .callback = fix_broken_hp_bios_irq9,
20110 +               .ident = "HP Pavilion N5400 Series Laptop",
20111 +               .matches = {
20112 +                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
20113 +                       DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
20114 +                       DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
20115 +                       DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
20116 +               },
20117 +       },
20118 +       {
20119 +               .callback = fix_acer_tm360_irqrouting,
20120 +               .ident = "Acer TravelMate 36x Laptop",
20121 +               .matches = {
20122 +                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
20123 +                       DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
20124 +               },
20125 +       },
20126 +       { }
20127 +};
20128 +
20129 +static int __init pcibios_irq_init(void)
20130 +{
20131 +       DBG(KERN_DEBUG "PCI: IRQ init\n");
20132 +
20133 +       if (pcibios_enable_irq || raw_pci_ops == NULL)
20134 +               return 0;
20135 +
20136 +       dmi_check_system(pciirq_dmi_table);
20137 +
20138 +       pirq_table = pirq_find_routing_table();
20139 +
20140 +#ifdef CONFIG_PCI_BIOS
20141 +       if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
20142 +               pirq_table = pcibios_get_irq_routing_table();
20143 +#endif
20144 +       if (pirq_table) {
20145 +               pirq_peer_trick();
20146 +               pirq_find_router(&pirq_router);
20147 +               if (pirq_table->exclusive_irqs) {
20148 +                       int i;
20149 +                       for (i=0; i<16; i++)
20150 +                               if (!(pirq_table->exclusive_irqs & (1 << i)))
20151 +                                       pirq_penalty[i] += 100;
20152 +               }
20153 +               /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
20154 +               if (io_apic_assign_pci_irqs)
20155 +                       pirq_table = NULL;
20156 +       }
20157 +
20158 +       pcibios_enable_irq = pirq_enable_irq;
20159 +
20160 +       pcibios_fixup_irqs();
20161 +       return 0;
20162 +}
20163 +
20164 +subsys_initcall(pcibios_irq_init);
20165 +
20166 +
20167 +static void pirq_penalize_isa_irq(int irq, int active)
20168 +{
20169 +       /*
20170 +        *  If any ISAPnP device reports an IRQ in its list of possible
20171 +        *  IRQ's, we try to avoid assigning it to PCI devices.
20172 +        */
20173 +       if (irq < 16) {
20174 +               if (active)
20175 +                       pirq_penalty[irq] += 1000;
20176 +               else
20177 +                       pirq_penalty[irq] += 100;
20178 +       }
20179 +}
20180 +
20181 +void pcibios_penalize_isa_irq(int irq, int active)
20182 +{
20183 +#ifdef CONFIG_ACPI
20184 +       if (!acpi_noirq)
20185 +               acpi_penalize_isa_irq(irq, active);
20186 +       else
20187 +#endif
20188 +               pirq_penalize_isa_irq(irq, active);
20189 +}
20190 +
20191 +static int pirq_enable_irq(struct pci_dev *dev)
20192 +{
20193 +       u8 pin;
20194 +       struct pci_dev *temp_dev;
20195 +
20196 +       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
20197 +       if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
20198 +               char *msg = "";
20199 +
20200 +               pin--;          /* interrupt pins are numbered starting from 1 */
20201 +
20202 +               if (io_apic_assign_pci_irqs) {
20203 +                       int irq;
20204 +
20205 +                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
20206 +                       /*
20207 +                        * Busses behind bridges are typically not listed in the MP-table.
20208 +                        * In this case we have to look up the IRQ based on the parent bus,
20209 +                        * parent slot, and pin number. The SMP code detects such bridged
20210 +                        * busses itself so we should get into this branch reliably.
20211 +                        */
20212 +                       temp_dev = dev;
20213 +                       while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
20214 +                               struct pci_dev * bridge = dev->bus->self;
20215 +
20216 +                               pin = (pin + PCI_SLOT(dev->devfn)) % 4;
20217 +                               irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
20218 +                                               PCI_SLOT(bridge->devfn), pin);
20219 +                               if (irq >= 0)
20220 +                                       printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
20221 +                                               pci_name(bridge), 'A' + pin, irq);
20222 +                               dev = bridge;
20223 +                       }
20224 +                       dev = temp_dev;
20225 +                       if (irq >= 0) {
20226 +#ifdef CONFIG_PCI_MSI
20227 +                               if (!platform_legacy_irq(irq))
20228 +                                       irq = IO_APIC_VECTOR(irq);
20229 +#endif
20230 +                               printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
20231 +                                       pci_name(dev), 'A' + pin, irq);
20232 +                               dev->irq = irq;
20233 +                               return 0;
20234 +                       } else
20235 +                               msg = " Probably buggy MP table.";
20236 +               } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
20237 +                       msg = "";
20238 +               else
20239 +                       msg = " Please try using pci=biosirq.";
20240 +
20241 +               /* With IDE legacy devices the IRQ lookup failure is not a problem. */
20242 +               if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
20243 +                       return 0;
20244 +
20245 +               printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
20246 +                      'A' + pin, pci_name(dev), msg);
20247 +       }
20248 +       return 0;
20249 +}
20250 +
20251 +int pci_vector_resources(int last, int nr_released)
20252 +{
20253 +       int count = nr_released;
20254 +
20255 +       int next = last;
20256 +       int offset = (last % 8);
20257 +
20258 +       while (next < FIRST_SYSTEM_VECTOR) {
20259 +               next += 8;
20260 +#ifdef CONFIG_X86_64
20261 +               if (next == IA32_SYSCALL_VECTOR)
20262 +                       continue;
20263 +#else
20264 +               if (next == SYSCALL_VECTOR)
20265 +                       continue;
20266 +#endif
20267 +               count++;
20268 +               if (next >= FIRST_SYSTEM_VECTOR) {
20269 +                       if (offset%8) {
20270 +                               next = FIRST_DEVICE_VECTOR + offset;
20271 +                               offset++;
20272 +                               continue;
20273 +                       }
20274 +                       count--;
20275 +               }
20276 +       }
20277 +
20278 +       return count;
20279 +}
20280 diff -Nurp ref-linux-2.6.16.9/arch/i386/pci/Makefile tmp-linux-2.6-xen.patch/arch/i386/pci/Makefile
20281 --- ref-linux-2.6.16.9/arch/i386/pci/Makefile   2006-04-19 08:10:14.000000000 +0200
20282 +++ tmp-linux-2.6-xen.patch/arch/i386/pci/Makefile      2006-04-10 00:05:52.000000000 +0200
20283 @@ -4,6 +4,10 @@ obj-$(CONFIG_PCI_BIOS)         += pcbios.o
20284  obj-$(CONFIG_PCI_MMCONFIG)     += mmconfig.o direct.o
20285  obj-$(CONFIG_PCI_DIRECT)       += direct.o
20286  
20287 +# pcifront should be after pcbios.o, mmconfig.o, and direct.o as it should only
20288 +# take over if direct access to the PCI bus is unavailable
20289 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront.o
20290 +
20291  pci-y                          := fixup.o
20292  pci-$(CONFIG_ACPI)             += acpi.o
20293  pci-y                          += legacy.o irq.o
20294 @@ -12,3 +16,8 @@ pci-$(CONFIG_X86_VISWS)               := visws.o fixu
20295  pci-$(CONFIG_X86_NUMAQ)                := numa.o irq.o
20296  
20297  obj-y                          += $(pci-y) common.o
20298 +
20299 +ifdef CONFIG_XEN
20300 +include $(srctree)/scripts/Makefile.xen
20301 +obj-y := $(call cherrypickxen, $(obj-y))
20302 +endif
20303 diff -Nurp ref-linux-2.6.16.9/arch/i386/pci/pcifront.c tmp-linux-2.6-xen.patch/arch/i386/pci/pcifront.c
20304 --- ref-linux-2.6.16.9/arch/i386/pci/pcifront.c 1970-01-01 01:00:00.000000000 +0100
20305 +++ tmp-linux-2.6-xen.patch/arch/i386/pci/pcifront.c    2006-04-10 00:05:52.000000000 +0200
20306 @@ -0,0 +1,55 @@
20307 +/*
20308 + * PCI Frontend Stub - puts some "dummy" functions into the Linux x86 PCI core
20309 + *                     to support the Xen PCI Frontend's operation
20310 + *
20311 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
20312 + */
20313 +#include <linux/module.h>
20314 +#include <linux/init.h>
20315 +#include <linux/pci.h>
20316 +#include <asm/acpi.h>
20317 +#include "pci.h"
20318 +
20319 +static int pcifront_enable_irq(struct pci_dev *dev)
20320 +{
20321 +       u8 irq;
20322 +       pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
20323 +       dev->irq = irq;
20324 +
20325 +       return 0;
20326 +}
20327 +
20328 +extern u8 pci_cache_line_size;
20329 +
20330 +static int __init pcifront_x86_stub_init(void)
20331 +{
20332 +       struct cpuinfo_x86 *c = &boot_cpu_data;
20333 +
20334 +       /* Only install our method if we haven't found real hardware already */
20335 +       if (raw_pci_ops)
20336 +               return 0;
20337 +
20338 +       printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
20339 +
20340 +       /* Copied from arch/i386/pci/common.c */
20341 +       pci_cache_line_size = 32 >> 2;
20342 +       if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
20343 +               pci_cache_line_size = 64 >> 2;  /* K7 & K8 */
20344 +       else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
20345 +               pci_cache_line_size = 128 >> 2; /* P4 */
20346 +
20347 +       /* On x86, we need to disable the normal IRQ routing table and
20348 +        * just ask the backend
20349 +        */
20350 +       pcibios_enable_irq = pcifront_enable_irq;
20351 +       pcibios_disable_irq = NULL;
20352 +
20353 +#ifdef CONFIG_ACPI
20354 +       /* Keep ACPI out of the picture */
20355 +       acpi_noirq = 1;
20356 +#endif
20357 +
20358 +       return 0;
20359 +}
20360 +
20361 +arch_initcall(pcifront_x86_stub_init);
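
One detail worth noting in the stub above: pci_cache_line_size is kept in 32-bit dword units, the same unit the PCI Cache Line Size register uses, hence the 32 >> 2 / 64 >> 2 / 128 >> 2 values rather than raw byte counts. A trivial check of the conversion (values taken from the stub):

#include <stdio.h>

int main(void)
{
        /* bytes -> dwords, as written into PCI_CACHE_LINE_SIZE */
        int bytes[] = { 32, 64, 128 };  /* default, K7/K8, P4 */
        int i;

        for (i = 0; i < 3; i++)
                printf("%3d-byte line -> register value %d\n",
                       bytes[i], bytes[i] >> 2);
        return 0;
}
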
20362 diff -Nurp ref-linux-2.6.16.9/arch/i386/power/Makefile tmp-linux-2.6-xen.patch/arch/i386/power/Makefile
20363 --- ref-linux-2.6.16.9/arch/i386/power/Makefile 2006-04-19 08:10:14.000000000 +0200
20364 +++ tmp-linux-2.6-xen.patch/arch/i386/power/Makefile    2006-04-10 00:05:52.000000000 +0200
20365 @@ -1,2 +1,4 @@
20366 -obj-$(CONFIG_PM)               += cpu.o
20367 +obj-$(CONFIG_PM_LEGACY)                += cpu.o
20368 +obj-$(CONFIG_SOFTWARE_SUSPEND) += cpu.o
20369 +obj-$(CONFIG_ACPI_SLEEP)       += cpu.o
20370  obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o
20371 diff -Nurp ref-linux-2.6.16.9/arch/ia64/hp/sim/Makefile tmp-linux-2.6-xen.patch/arch/ia64/hp/sim/Makefile
20372 --- ref-linux-2.6.16.9/arch/ia64/hp/sim/Makefile        2006-04-19 08:10:14.000000000 +0200
20373 +++ tmp-linux-2.6-xen.patch/arch/ia64/hp/sim/Makefile   2006-04-10 00:05:52.000000000 +0200
20374 @@ -14,3 +14,5 @@ obj-$(CONFIG_HP_SIMETH)       += simeth.o
20375  obj-$(CONFIG_HP_SIMSERIAL) += simserial.o
20376  obj-$(CONFIG_HP_SIMSERIAL_CONSOLE) += hpsim_console.o
20377  obj-$(CONFIG_HP_SIMSCSI) += simscsi.o
20378 +obj-$(CONFIG_XEN) += simserial.o
20379 +obj-$(CONFIG_XEN) += hpsim_console.o
20380 diff -Nurp ref-linux-2.6.16.9/arch/ia64/Kconfig tmp-linux-2.6-xen.patch/arch/ia64/Kconfig
20381 --- ref-linux-2.6.16.9/arch/ia64/Kconfig        2006-04-19 08:10:14.000000000 +0200
20382 +++ tmp-linux-2.6-xen.patch/arch/ia64/Kconfig   2006-04-10 00:05:52.000000000 +0200
20383 @@ -50,6 +50,52 @@ config GENERIC_IOMAP
20384         bool
20385         default y
20386  
20387 +config XEN
20388 +       bool
20389 +       default y
20390 +       help
20391 +         Enable Xen hypervisor support.  Resulting kernel runs
20392 +         both as a guest OS on Xen and natively on hardware.
20393 +
20394 +config ARCH_XEN
20395 +       bool
20396 +       default y
20397 +       help
20398 +         TEMP ONLY. Needs to be on for drivers/xen to build.
20399 +
20400 +config XEN_PRIVILEGED_GUEST
20401 +       bool "Privileged Guest"
20402 +       default n
20403 +       help
20404 +         Used in drivers/xen/privcmd.c.  Should go away?
20405 +
20406 +config XEN_BLKDEV_GRANT
20407 +       depends on XEN
20408 +       bool
20409 +       default y
20410 +
20411 +config XEN_BLKDEV_FRONTEND
20412 +       depends on XEN
20413 +       bool
20414 +       default y
20415 +
20416 +config XEN_BLKDEV_BACKEND
20417 +       depends on XEN
20418 +       bool
20419 +       default y
20420 +
20421 +config XEN_SYSFS
20422 +       bool "Export Xen attributes in sysfs"
20423 +       depends on XEN && SYSFS
20424 +       default y
20425 +       help
20426 +         Xen hypervisor attributes will show up under /sys/hypervisor/.
20427 +
20428 +config XEN_INTERFACE_VERSION
20429 +       hex
20430 +       depends on XEN
20431 +       default 0x00030101
20432 +
20433  config SCHED_NO_NO_OMIT_FRAME_POINTER
20434         bool
20435         default y
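CONFIG_XEN_INTERFACE_VERSION above is a plain hex symbol; the arch/ia64 Makefile hunk further down turns it into a -D__XEN_INTERFACE_VERSION__ preprocessor define, so interface headers can gate declarations on it. A minimal standalone sketch of such a gate, with the default value copied from the Kconfig entry:

/* Sketch of a version gate; normally -D__XEN_INTERFACE_VERSION__ comes
 * in through CPPFLAGS as added by the Makefile hunk below. */
#include <stdio.h>

#ifndef __XEN_INTERFACE_VERSION__
#define __XEN_INTERFACE_VERSION__ 0x00030101	/* Kconfig default above */
#endif

int main(void)
{
#if __XEN_INTERFACE_VERSION__ >= 0x00030101
	printf("built against interface 0x%08x\n",
	       (unsigned int)__XEN_INTERFACE_VERSION__);
#else
	printf("built against a pre-3.0.1 interface\n");
#endif
	return 0;
}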
20436 diff -Nurp ref-linux-2.6.16.9/arch/ia64/kernel/entry.S tmp-linux-2.6-xen.patch/arch/ia64/kernel/entry.S
20437 --- ref-linux-2.6.16.9/arch/ia64/kernel/entry.S 2006-04-19 08:10:14.000000000 +0200
20438 +++ tmp-linux-2.6-xen.patch/arch/ia64/kernel/entry.S    2006-04-10 00:05:52.000000000 +0200
20439 @@ -181,7 +181,7 @@ END(sys_clone)
20440   *     called.  The code starting at .map relies on this.  The rest of the code
20441   *     doesn't care about the interrupt masking status.
20442   */
20443 -GLOBAL_ENTRY(ia64_switch_to)
20444 +GLOBAL_ENTRY(__ia64_switch_to)
20445         .prologue
20446         alloc r16=ar.pfs,1,0,0,0
20447         DO_SAVE_SWITCH_STACK
20448 @@ -235,7 +235,7 @@ GLOBAL_ENTRY(ia64_switch_to)
20449         ;;
20450         srlz.d
20451         br.cond.sptk .done
20452 -END(ia64_switch_to)
20453 +END(__ia64_switch_to)
20454  
20455  /*
20456   * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
20457 @@ -376,7 +376,7 @@ END(save_switch_stack)
20458   *     - b7 holds address to return to
20459   *     - must not touch r8-r11
20460   */
20461 -ENTRY(load_switch_stack)
20462 +GLOBAL_ENTRY(load_switch_stack)
20463         .prologue
20464         .altrp b7
20465  
20466 @@ -511,7 +511,7 @@ END(clone)
20467          * because some system calls (such as ia64_execve) directly
20468          * manipulate ar.pfs.
20469          */
20470 -GLOBAL_ENTRY(ia64_trace_syscall)
20471 +GLOBAL_ENTRY(__ia64_trace_syscall)
20472         PT_REGS_UNWIND_INFO(0)
20473         /*
20474          * We need to preserve the scratch registers f6-f11 in case the system
20475 @@ -583,7 +583,7 @@ strace_error:
20476  (p6)   mov r10=-1
20477  (p6)   mov r8=r9
20478         br.cond.sptk .strace_save_retval
20479 -END(ia64_trace_syscall)
20480 +END(__ia64_trace_syscall)
20481  
20482         /*
20483          * When traced and returning from sigreturn, we invoke syscall_trace but then
20484 @@ -636,8 +636,11 @@ GLOBAL_ENTRY(ia64_ret_from_syscall)
20485         adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
20486         mov r10=r0                              // clear error indication in r10
20487  (p7)   br.cond.spnt handle_syscall_error       // handle potential syscall failure
20488 +       ;;
20489 +       // don't fall through, ia64_leave_syscall may be #define'd
20490 +       br.cond.sptk.few ia64_leave_syscall
20491 +       ;;
20492  END(ia64_ret_from_syscall)
20493 -       // fall through
20494  /*
20495   * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
20496   *     need to switch to bank 0 and doesn't restore the scratch registers.
20497 @@ -682,7 +685,7 @@ END(ia64_ret_from_syscall)
20498   *           ar.csd: cleared
20499   *           ar.ssd: cleared
20500   */
20501 -ENTRY(ia64_leave_syscall)
20502 +GLOBAL_ENTRY(__ia64_leave_syscall)
20503         PT_REGS_UNWIND_INFO(0)
20504         /*
20505          * work.need_resched etc. mustn't get changed by this CPU before it returns to
20506 @@ -790,7 +793,7 @@ ENTRY(ia64_leave_syscall)
20507         mov.m ar.ssd=r0                 // M2   clear ar.ssd
20508         mov f11=f0                      // F    clear f11
20509         br.cond.sptk.many rbs_switch    // B
20510 -END(ia64_leave_syscall)
20511 +END(__ia64_leave_syscall)
20512  
20513  #ifdef CONFIG_IA32_SUPPORT
20514  GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
20515 @@ -802,10 +805,13 @@ GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
20516         st8.spill [r2]=r8       // store return value in slot for r8 and set unat bit
20517         .mem.offset 8,0
20518         st8.spill [r3]=r0       // clear error indication in slot for r10 and set unat bit
20519 +       ;;
20520 +       // don't fall through, ia64_leave_kernel may be #define'd
20521 +       br.cond.sptk.few ia64_leave_kernel
20522 +       ;;
20523  END(ia64_ret_from_ia32_execve)
20524 -       // fall through
20525  #endif /* CONFIG_IA32_SUPPORT */
20526 -GLOBAL_ENTRY(ia64_leave_kernel)
20527 +GLOBAL_ENTRY(__ia64_leave_kernel)
20528         PT_REGS_UNWIND_INFO(0)
20529         /*
20530          * work.need_resched etc. mustn't get changed by this CPU before it returns to
20531 @@ -1150,7 +1156,7 @@ skip_rbs_switch:
20532         ld8 r10=[r3]
20533         br.cond.sptk.many .work_processed_syscall       // re-check
20534  
20535 -END(ia64_leave_kernel)
20536 +END(__ia64_leave_kernel)
20537  
20538  ENTRY(handle_syscall_error)
20539         /*
20540 @@ -1190,7 +1196,7 @@ END(ia64_invoke_schedule_tail)
20541          * be set up by the caller.  We declare 8 input registers so the system call
20542          * args get preserved, in case we need to restart a system call.
20543          */
20544 -ENTRY(notify_resume_user)
20545 +GLOBAL_ENTRY(notify_resume_user)
20546         .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
20547         alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
20548         mov r9=ar.unat
20549 @@ -1278,7 +1284,7 @@ ENTRY(sys_rt_sigreturn)
20550         adds sp=16,sp
20551         ;;
20552         ld8 r9=[sp]                             // load new ar.unat
20553 -       mov.sptk b7=r8,ia64_leave_kernel
20554 +       mov.sptk b7=r8,__ia64_leave_kernel
20555         ;;
20556         mov ar.unat=r9
20557         br.many b7
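The pattern running through this entry.S hunk is a rename: ia64_switch_to, ia64_trace_syscall, ia64_leave_syscall and ia64_leave_kernel become __ia64_* so that a Xen build can interpose its own entry points, and the new branch comments note that the short names "may be #define'd". A sketch of the kind of header mapping this enables (the macro layout is an assumption for illustration, not taken from the patch):

/* Assumed header sketch: map the generic entry-point names onto the
 * Xen-aware variants, which themselves branch back to the __ia64_*
 * originals when running_on_xen is zero. */
#ifdef CONFIG_XEN
# define ia64_switch_to		xen_switch_to
# define ia64_leave_syscall	xen_leave_syscall
# define ia64_leave_kernel	xen_leave_kernel
#else
# define ia64_switch_to		__ia64_switch_to
# define ia64_leave_syscall	__ia64_leave_syscall
# define ia64_leave_kernel	__ia64_leave_kernel
#endif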
20558 diff -Nurp ref-linux-2.6.16.9/arch/ia64/kernel/head.S tmp-linux-2.6-xen.patch/arch/ia64/kernel/head.S
20559 --- ref-linux-2.6.16.9/arch/ia64/kernel/head.S  2006-04-19 08:10:14.000000000 +0200
20560 +++ tmp-linux-2.6-xen.patch/arch/ia64/kernel/head.S     2006-04-10 00:05:52.000000000 +0200
20561 @@ -363,6 +363,12 @@ start_ap:
20562         ;;
20563  (isBP) st8 [r2]=r28            // save the address of the boot param area passed by the bootloader
20564  
20565 +#ifdef CONFIG_XEN
20566 +       //  Note: isBP is used by the subprogram.
20567 +       br.call.sptk.many rp=early_xen_setup
20568 +       ;;
20569 +#endif
20570 +
20571  #ifdef CONFIG_SMP
20572  (isAP) br.call.sptk.many rp=start_secondary
20573  .ret0:
20574 diff -Nurp ref-linux-2.6.16.9/arch/ia64/kernel/pal.S tmp-linux-2.6-xen.patch/arch/ia64/kernel/pal.S
20575 --- ref-linux-2.6.16.9/arch/ia64/kernel/pal.S   2006-04-19 08:10:14.000000000 +0200
20576 +++ tmp-linux-2.6-xen.patch/arch/ia64/kernel/pal.S      2006-04-10 00:05:52.000000000 +0200
20577 @@ -16,6 +16,7 @@
20578  #include <asm/processor.h>
20579  
20580         .data
20581 +       .globl pal_entry_point
20582  pal_entry_point:
20583         data8 ia64_pal_default_handler
20584         .text
20585 @@ -53,7 +54,7 @@ END(ia64_pal_default_handler)
20586   * in4        1 ==> clear psr.ic,  0 ==> don't clear psr.ic
20587   *
20588   */
20589 -GLOBAL_ENTRY(ia64_pal_call_static)
20590 +GLOBAL_ENTRY(__ia64_pal_call_static)
20591         .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
20592         alloc loc1 = ar.pfs,5,5,0,0
20593         movl loc2 = pal_entry_point
20594 @@ -90,7 +91,7 @@ GLOBAL_ENTRY(ia64_pal_call_static)
20595         ;;
20596         srlz.d                          // serialize restoration of psr.l
20597         br.ret.sptk.many b0
20598 -END(ia64_pal_call_static)
20599 +END(__ia64_pal_call_static)
20600  
20601  /*
20602   * Make a PAL call using the stacked registers calling convention.
20603 diff -Nurp ref-linux-2.6.16.9/arch/ia64/kernel/sal.c tmp-linux-2.6-xen.patch/arch/ia64/kernel/sal.c
20604 --- ref-linux-2.6.16.9/arch/ia64/kernel/sal.c   2006-04-19 08:10:14.000000000 +0200
20605 +++ tmp-linux-2.6-xen.patch/arch/ia64/kernel/sal.c      2006-04-10 00:05:52.000000000 +0200
20606 @@ -336,6 +336,9 @@ ia64_sal_init (struct ia64_sal_systab *s
20607                 p += SAL_DESC_SIZE(*p);
20608         }
20609  
20610 +#ifdef CONFIG_XEN
20611 +       if (!running_on_xen)
20612 +#endif
20613         check_sal_cache_flush();
20614  }
20615  
20616 diff -Nurp ref-linux-2.6.16.9/arch/ia64/kernel/setup.c tmp-linux-2.6-xen.patch/arch/ia64/kernel/setup.c
20617 --- ref-linux-2.6.16.9/arch/ia64/kernel/setup.c 2006-04-19 08:10:14.000000000 +0200
20618 +++ tmp-linux-2.6-xen.patch/arch/ia64/kernel/setup.c    2006-04-10 00:05:52.000000000 +0200
20619 @@ -61,6 +61,9 @@
20620  #include <asm/system.h>
20621  #include <asm/unistd.h>
20622  #include <asm/system.h>
20623 +#ifdef CONFIG_XEN
20624 +#include <asm/hypervisor.h>
20625 +#endif
20626  
20627  #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
20628  # error "struct cpuinfo_ia64 too big!"
20629 @@ -243,6 +246,14 @@ reserve_memory (void)
20630         rsvd_region[n].end   = (unsigned long) ia64_imva(_end);
20631         n++;
20632  
20633 +#ifdef CONFIG_XEN
20634 +       if (running_on_xen) {
20635 +               rsvd_region[n].start = (unsigned long)__va((HYPERVISOR_shared_info->arch.start_info_pfn << PAGE_SHIFT));
20636 +               rsvd_region[n].end   = rsvd_region[n].start + PAGE_SIZE;
20637 +               n++;
20638 +       }
20639 +#endif
20640 +
20641  #ifdef CONFIG_BLK_DEV_INITRD
20642         if (ia64_boot_param->initrd_start) {
20643                 rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
20644 @@ -260,6 +271,7 @@ reserve_memory (void)
20645         n++;
20646  
20647         num_rsvd_regions = n;
20648 +       BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n);
20649  
20650         sort_regions(rsvd_region, num_rsvd_regions);
20651  }
20652 @@ -333,6 +345,10 @@ early_console_setup (char *cmdline)
20653  {
20654         int earlycons = 0;
20655  
20656 +#ifdef CONFIG_XEN
20657 +       if (!early_xen_console_setup(cmdline))
20658 +               earlycons++;
20659 +#endif
20660  #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
20661         {
20662                 extern int sn_serial_console_early_setup(void);
20663 @@ -490,6 +506,22 @@ setup_arch (char **cmdline_p)
20664                         conswitchp = &vga_con;
20665  # endif
20666         }
20667 +#ifdef CONFIG_XEN
20668 +       if (running_on_xen) {
20669 +               extern shared_info_t *HYPERVISOR_shared_info;
20670 +
20671 +               /* xen_start_info isn't setup yet, get the flags manually */
20672 +               if (HYPERVISOR_shared_info->arch.flags & SIF_INITDOMAIN) {
20673 +                       if (!(HYPERVISOR_shared_info->arch.flags & SIF_PRIVILEGED))
20674 +                               panic("Xen granted us console access "
20675 +                                     "but not privileged status");
20676 +               } else {
20677 +                       extern int console_use_vt;
20678 +                       conswitchp = NULL;
20679 +                       console_use_vt = 0;
20680 +               }
20681 +       }
20682 +#endif
20683  #endif
20684  
20685         /* enable IA-64 Machine Check Abort Handling unless disabled */
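reserve_memory() above carves out exactly one page at the address named by HYPERVISOR_shared_info->arch.start_info_pfn; the conversion from pfn to address is plain shift arithmetic. A standalone model of that bookkeeping (the PAGE_SHIFT value is an assumption for the demo):

/* Model of the rsvd_region arithmetic above: pfn << PAGE_SHIFT gives
 * the physical address, and the reserved span is exactly one page. */
#include <stdio.h>

#define PAGE_SHIFT 14UL			/* 16K pages, one common ia64 choice */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long start_info_pfn = 0x1234;	/* example pfn */
	unsigned long start = start_info_pfn << PAGE_SHIFT;

	printf("reserve [%#lx, %#lx)\n", start, start + PAGE_SIZE);
	return 0;
}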
20686 diff -Nurp ref-linux-2.6.16.9/arch/ia64/Makefile tmp-linux-2.6-xen.patch/arch/ia64/Makefile
20687 --- ref-linux-2.6.16.9/arch/ia64/Makefile       2006-04-19 08:10:14.000000000 +0200
20688 +++ tmp-linux-2.6-xen.patch/arch/ia64/Makefile  2006-04-10 00:05:52.000000000 +0200
20689 @@ -42,6 +42,12 @@ ifeq ($(call cc-version),0304)
20690  endif
20691  
20692  CFLAGS += $(cflags-y)
20693 +
20694 +cppflags-$(CONFIG_XEN) += \
20695 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
20696 +
20697 +CPPFLAGS += $(cppflags-y)
20698 +
20699  head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o
20700  
20701  libs-y                         += arch/ia64/lib/
20702 @@ -52,9 +58,15 @@ core-$(CONFIG_IA64_GENERIC)  += arch/ia6
20703  core-$(CONFIG_IA64_HP_ZX1)     += arch/ia64/dig/
20704  core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
20705  core-$(CONFIG_IA64_SGI_SN2)    += arch/ia64/sn/
20706 +core-$(CONFIG_XEN)             += arch/ia64/xen/
20707  
20708  drivers-$(CONFIG_PCI)          += arch/ia64/pci/
20709 +ifneq ($(CONFIG_XEN),y)
20710  drivers-$(CONFIG_IA64_HP_SIM)  += arch/ia64/hp/sim/
20711 +endif
20712 +ifneq ($(CONFIG_IA64_GENERIC),y)
20713 +drivers-$(CONFIG_XEN)          += arch/ia64/hp/sim/
20714 +endif
20715  drivers-$(CONFIG_IA64_HP_ZX1)  += arch/ia64/hp/common/ arch/ia64/hp/zx1/
20716  drivers-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
20717  drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ arch/ia64/sn/
20718 @@ -68,6 +80,8 @@ all: compressed unwcheck
20719  
20720  compressed: vmlinux.gz
20721  
20722 +vmlinuz: vmlinux.gz
20723 +
20724  vmlinux.gz: vmlinux
20725         $(Q)$(MAKE) $(build)=$(boot) $@
20726  
20727 @@ -82,7 +96,7 @@ CLEAN_FILES += vmlinux.gz bootloader
20728  boot:  lib/lib.a vmlinux
20729         $(Q)$(MAKE) $(build)=$(boot) $@
20730  
20731 -install: vmlinux.gz
20732 +install:
20733         sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) $< System.map "$(INSTALL_PATH)"
20734  
20735  define archhelp
20736 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/coreMakefile tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/coreMakefile
20737 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/coreMakefile       1970-01-01 01:00:00.000000000 +0100
20738 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/coreMakefile  2006-04-10 00:05:52.000000000 +0200
20739 @@ -0,0 +1,26 @@
20740 +#
20741 +# Makefile for the linux kernel.
20742 +#
20743 +
20744 +XENARCH        := $(subst ",,$(CONFIG_XENARCH))
20745 +
20746 +CPPFLAGS_vmlinux.lds += -U$(XENARCH)
20747 +
20748 +$(obj)/vmlinux.lds.S:
20749 +       @ln -fsn $(srctree)/arch/$(XENARCH)/kernel/vmlinux.lds.S $@
20750 +
20751 +
20752 +obj-y   := gnttab.o features.o
20753 +obj-$(CONFIG_PROC_FS) += xen_proc.o
20754 +
20755 +ifeq ($(ARCH),ia64)
20756 +obj-y   += evtchn_ia64.o
20757 +obj-y   += xenia64_init.o
20758 +else
20759 +extra-y += vmlinux.lds
20760 +obj-y   += reboot.o evtchn.o fixup.o 
20761 +obj-$(CONFIG_SMP)     += smp.o         # setup_profiling_timer def'd in ia64
20762 +obj-$(CONFIG_NET)     += skbuff.o      # until networking is up on ia64
20763 +endif
20764 +obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
20765 +obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
20766 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/evtchn_ia64.c tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/evtchn_ia64.c
20767 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/evtchn_ia64.c      1970-01-01 01:00:00.000000000 +0100
20768 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/evtchn_ia64.c 2006-04-10 00:05:52.000000000 +0200
20769 @@ -0,0 +1,273 @@
20770 +/* NOTE: This file split off from evtchn.c because there was
20771 +   some discussion that the mechanism is sufficiently different.
20772 +   It may be possible to merge it back in the future... djm */
20773 +#include <linux/config.h>
20774 +#include <linux/kernel.h>
20775 +#include <asm/hw_irq.h>
20776 +#include <xen/evtchn.h>
20777 +
20778 +#define MAX_EVTCHN 1024
20779 +
20780 +/* Xen will never allocate port zero for any purpose. */
20781 +#define VALID_EVTCHN(_chn) (((_chn) != 0) && ((_chn) < MAX_EVTCHN))
20782 +
20783 +/* Binding types. Only IRQT_VIRQ and IRQT_EVTCHN are supported for
20784 + * XEN/IA64 at present. - ktian1
20785 + */
20786 +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
20787 +
20788 +/* Constructor for packed IRQ information. */
20789 +#define mk_irq_info(type, index, evtchn)                               \
20790 +       (((u32)(type) << 24) | ((u32)(index) << 16) | (u32)(evtchn))
20791 +/* Convenient shorthand for packed representation of an unbound IRQ. */
20792 +#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
20793 +/* Accessor macros for packed IRQ information. */
20794 +#define evtchn_from_irq(irq) ((u16)(irq_info[irq]))
20795 +#define index_from_irq(irq)  ((u8)(irq_info[irq] >> 16))
20796 +#define type_from_irq(irq)   ((u8)(irq_info[irq] >> 24))
20797 +
20798 +/* Packed IRQ information: binding type, sub-type index, and event channel. */
20799 +static u32 irq_info[NR_IRQS];
20800 +
20801 +/* On XEN/IA64 all event channels are bound to a single physical irq
20802 + * vector, so the event-channel number and the 'irq' number are always
20803 + * identical in this context. - ktian1
20804 + */
20805 +static struct {
20806 +       irqreturn_t (*handler)(int, void *, struct pt_regs *);
20807 +       void *dev_id;
20808 +       char opened;    /* Whether allocated */
20809 +} evtchns[MAX_EVTCHN];
20810 +
20811 +/*
20812 + * This lock protects updates to the following mapping and reference-count
20813 + * arrays. The lock does not need to be acquired to read the mapping tables.
20814 + */
20815 +static spinlock_t irq_mapping_update_lock;
20816 +
20817 +void mask_evtchn(int port)
20818 +{
20819 +       shared_info_t *s = HYPERVISOR_shared_info;
20820 +       synch_set_bit(port, &s->evtchn_mask[0]);
20821 +}
20822 +EXPORT_SYMBOL(mask_evtchn);
20823 +
20824 +void unmask_evtchn(int port)
20825 +{
20826 +       shared_info_t *s = HYPERVISOR_shared_info;
20827 +       unsigned int cpu = smp_processor_id();
20828 +       vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
20829 +
20830 +#if 0  // FIXME: diverged from x86 evtchn.c
20831 +       /* Slow path (hypercall) if this is a non-local port. */
20832 +       if (unlikely(cpu != cpu_from_evtchn(port))) {
20833 +               evtchn_op_t op = { .cmd = EVTCHNOP_unmask,
20834 +                                  .u.unmask.port = port };
20835 +               (void)HYPERVISOR_event_channel_op(&op);
20836 +               return;
20837 +       }
20838 +#endif
20839 +
20840 +       synch_clear_bit(port, &s->evtchn_mask[0]);
20841 +
20842 +       /*
20843 +        * The following is basically the equivalent of 'hw_resend_irq'. Just
20844 +        * like a real IO-APIC we 'lose the interrupt edge' if the channel is
20845 +        * masked.
20846 +        */
20847 +       if (synch_test_bit(port, &s->evtchn_pending[0]) && 
20848 +           !synch_test_and_set_bit(port / BITS_PER_LONG,
20849 +                                   &vcpu_info->evtchn_pending_sel)) {
20850 +               vcpu_info->evtchn_upcall_pending = 1;
20851 +               if (!vcpu_info->evtchn_upcall_mask)
20852 +                       force_evtchn_callback();
20853 +       }
20854 +}
20855 +EXPORT_SYMBOL(unmask_evtchn);
20856 +
20857 +
20858 +#define unbound_irq(e) (VALID_EVTCHN(e) && (!evtchns[(e)].opened))
20859 +int bind_virq_to_irqhandler(
20860 +       unsigned int virq,
20861 +       unsigned int cpu,
20862 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
20863 +       unsigned long irqflags,
20864 +       const char *devname,
20865 +       void *dev_id)
20866 +{
20867 +    evtchn_op_t op;
20868 +    int evtchn;
20869 +
20870 +    spin_lock(&irq_mapping_update_lock);
20871 +
20872 +    op.cmd = EVTCHNOP_bind_virq;
20873 +    op.u.bind_virq.virq = virq;
20874 +    op.u.bind_virq.vcpu = cpu;
20875 +    BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
20876 +    evtchn = op.u.bind_virq.port;
20877 +
20878 +    if (!unbound_irq(evtchn)) {
20879 +        evtchn = -EINVAL;
20880 +        goto out;
20881 +    }
20882 +
20883 +    evtchns[evtchn].handler = handler;
20884 +    evtchns[evtchn].dev_id = dev_id;
20885 +    evtchns[evtchn].opened = 1;
20886 +    irq_info[evtchn] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
20887 +
20888 +    unmask_evtchn(evtchn);
20889 +out:
20890 +    spin_unlock(&irq_mapping_update_lock);
20891 +    return evtchn;
20892 +}
20893 +
20894 +int bind_evtchn_to_irqhandler(unsigned int evtchn,
20895 +                   irqreturn_t (*handler)(int, void *, struct pt_regs *),
20896 +                   unsigned long irqflags, const char * devname, void *dev_id)
20897 +{
20898 +    spin_lock(&irq_mapping_update_lock);
20899 +
20900 +    if (!unbound_irq(evtchn)) {
20901 +       evtchn = -EINVAL;
20902 +       goto out;
20903 +    }
20904 +
20905 +    evtchns[evtchn].handler = handler;
20906 +    evtchns[evtchn].dev_id = dev_id;
20907 +    evtchns[evtchn].opened = 1;
20908 +    irq_info[evtchn] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
20909 +
20910 +    unmask_evtchn(evtchn);
20911 +out:
20912 +    spin_unlock(&irq_mapping_update_lock);
20913 +    return evtchn;
20914 +}
20915 +
20916 +int bind_ipi_to_irqhandler(
20917 +       unsigned int ipi,
20918 +       unsigned int cpu,
20919 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
20920 +       unsigned long irqflags,
20921 +       const char *devname,
20922 +       void *dev_id)
20923 +{
20924 +    printk(KERN_ERR "%s called but not supported yet\n", __FUNCTION__);
20925 +    while (1);	/* deliberately trap here: IPI binding unimplemented */
20926 +}
20927 +
20928 +void unbind_from_irqhandler(unsigned int irq, void *dev_id)
20929 +{
20930 +    evtchn_op_t op;
20931 +    int evtchn = evtchn_from_irq(irq);
20932 +
20933 +    spin_lock(&irq_mapping_update_lock);
20934 +
20935 +    if (unbound_irq(irq))
20936 +        goto out;
20937 +
20938 +    op.cmd = EVTCHNOP_close;
20939 +    op.u.close.port = evtchn;
20940 +    BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
20941 +
20942 +    switch (type_from_irq(irq)) {
20943 +       case IRQT_VIRQ:
20944 +           /* Add smp stuff later... */
20945 +           break;
20946 +       case IRQT_IPI:
20947 +           /* Add smp stuff later... */
20948 +           break;
20949 +       default:
20950 +           break;
20951 +    }
20952 +
20953 +    mask_evtchn(evtchn);
20954 +    evtchns[evtchn].handler = NULL;
20955 +    evtchns[evtchn].opened = 0;
20956 +
20957 +out:
20958 +    spin_unlock(&irq_mapping_update_lock);
20959 +}
20960 +
20961 +void notify_remote_via_irq(int irq)
20962 +{
20963 +       int evtchn = evtchn_from_irq(irq);
20964 +
20965 +       if (!unbound_irq(evtchn))
20966 +               notify_remote_via_evtchn(evtchn);
20967 +}
20968 +
20969 +irqreturn_t evtchn_interrupt(int irq, void *dev_id, struct pt_regs *regs)
20970 +{
20971 +    unsigned long  l1, l2;
20972 +    unsigned int   l1i, l2i, port;
20973 +    irqreturn_t (*handler)(int, void *, struct pt_regs *);
20974 +    shared_info_t *s = HYPERVISOR_shared_info;
20975 +    vcpu_info_t   *vcpu_info = &s->vcpu_info[smp_processor_id()];
20976 +
20977 +    vcpu_info->evtchn_upcall_mask = 1;
20978 +    vcpu_info->evtchn_upcall_pending = 0;
20979 +
20980 +    /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
20981 +    l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
20982 +    while ( l1 != 0 )
20983 +    {
20984 +        l1i = __ffs(l1);
20985 +        l1 &= ~(1UL << l1i);
20986 +
20987 +        while ( (l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i]) != 0 )
20988 +        {
20989 +            l2i = __ffs(l2);
20990 +            l2 &= ~(1UL << l2i);
20991 +
20992 +            port = (l1i * BITS_PER_LONG) + l2i;
20993 +            if ( (handler = evtchns[port].handler) != NULL )
20994 +           {
20995 +               clear_evtchn(port);
20996 +                handler(port, evtchns[port].dev_id, regs);
20997 +           }
20998 +            else
20999 +           {
21000 +                evtchn_device_upcall(port);
21001 +           }
21002 +        }
21003 +    }
21004 +    vcpu_info->evtchn_upcall_mask = 0;
21005 +    return IRQ_HANDLED;
21006 +}
21007 +
21008 +void force_evtchn_callback(void)
21009 +{
21010 +	/* no-op on ia64 for now; x86 forces a callback here via the dummy hypercall (void)HYPERVISOR_xen_version(0, NULL) */
21011 +}
21012 +
21013 +static struct irqaction evtchn_irqaction = {
21014 +       .handler =      evtchn_interrupt,
21015 +       .flags =        SA_INTERRUPT,
21016 +       .name =         "xen-event-channel"
21017 +};
21018 +
21019 +int evtchn_irq = 0xe9;
21020 +void __init evtchn_init(void)
21021 +{
21022 +    shared_info_t *s = HYPERVISOR_shared_info;
21023 +    vcpu_info_t   *vcpu_info = &s->vcpu_info[smp_processor_id()];
21024 +
21025 +#if 0
21026 +    int ret;
21027 +    irq = assign_irq_vector(AUTO_ASSIGN);
21028 +    ret = request_irq(irq, evtchn_interrupt, 0, "xen-event-channel", NULL);
21029 +    if (ret < 0)
21030 +    {
21031 +       printk("xen-event-channel unable to get irq %d (%d)\n", irq, ret);
21032 +       return;
21033 +    }
21034 +#endif
21035 +    register_percpu_irq(evtchn_irq, &evtchn_irqaction);
21036 +
21037 +    vcpu_info->arch.evtchn_vector = evtchn_irq;
21038 +    printk(KERN_INFO "xen-event-channel using irq %d\n", evtchn_irq);
21039 +
21040 +    spin_lock_init(&irq_mapping_update_lock);
21041 +    memset(evtchns, 0, sizeof(evtchns));
21042 +}
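evtchn_interrupt() above walks a two-level bitmap: evtchn_pending_sel selects which pending words to inspect, each pending word is filtered against its mask word, and __ffs peels off set bits lowest-first. A standalone model of that scan (word count and bit patterns are made up for the demo):

/* Two-level pending-bitmap scan, modelled after evtchn_interrupt(). */
#include <stdio.h>

#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define NWORDS		4

static int lowest_bit(unsigned long x)	/* stand-in for __ffs; x != 0 */
{
	int i = 0;

	while (!(x & 1UL)) {
		x >>= 1;
		i++;
	}
	return i;
}

int main(void)
{
	unsigned long pending[NWORDS] = { 0x5, 0, 0x100, 0 };
	unsigned long mask[NWORDS]    = { 0x4, 0, 0,     0 };
	unsigned long l1 = 0;
	int w;

	for (w = 0; w < NWORDS; w++)	/* evtchn_pending_sel equivalent */
		if (pending[w] & ~mask[w])
			l1 |= 1UL << w;

	while (l1 != 0) {
		int l1i = lowest_bit(l1);
		unsigned long l2;

		l1 &= ~(1UL << l1i);
		while ((l2 = pending[l1i] & ~mask[l1i]) != 0) {
			int l2i = lowest_bit(l2);
			int port = l1i * BITS_PER_LONG + l2i;

			pending[l1i] &= ~(1UL << l2i);	/* clear_evtchn() */
			printf("deliver port %d\n", port);
		}
	}
	return 0;
}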
21043 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/Makefile tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/Makefile
21044 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/Makefile   1970-01-01 01:00:00.000000000 +0100
21045 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/Makefile      2006-04-10 00:05:52.000000000 +0200
21046 @@ -0,0 +1,20 @@
21047 +
21048 +obj-y   += util.o
21049 +
21050 +obj-y  += core/
21051 +obj-y  += console/
21052 +obj-y  += evtchn/
21053 +#obj-y += balloon/
21054 +obj-y  += privcmd/
21055 +obj-y  += blkback/
21056 +#obj-y += netback/
21057 +obj-y  += blkfront/
21058 +obj-y  += xenbus/
21059 +#obj-y += netfront/
21060 +#obj-$(CONFIG_XEN_PRIVILEGED_GUEST)    += privcmd/
21061 +#obj-$(CONFIG_XEN_BLKDEV_BACKEND)      += blkback/
21062 +#obj-$(CONFIG_XEN_NETDEV_BACKEND)      += netback/
21063 +#obj-$(CONFIG_XEN_BLKDEV_FRONTEND)     += blkfront/
21064 +#obj-$(CONFIG_XEN_NETDEV_FRONTEND)     += netfront/
21065 +#obj-$(CONFIG_XEN_BLKDEV_TAP)          += blktap/
21066 +
21067 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/blkback.c.patch tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/blkback.c.patch
21068 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/blkback.c.patch    1970-01-01 01:00:00.000000000 +0100
21069 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/blkback.c.patch       2006-04-10 00:05:52.000000000 +0200
21070 @@ -0,0 +1,57 @@
21071 +diff -Naur xen/blkback/blkback.c xen.patched/blkback/blkback.c
21072 +--- xen/blkback/blkback.c      2005-09-23 10:54:50.000000000 -0600
21073 ++++ xen.patched/blkback/blkback.c      2005-09-23 10:57:51.000000000 -0600
21074 +@@ -30,10 +30,16 @@
21075 + static unsigned long mmap_vstart;
21076 + #define MMAP_PAGES                                            \
21077 +       (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
21078 ++#ifdef __ia64__
21079 ++static void *pending_vaddrs[MMAP_PAGES];
21080 ++#define MMAP_VADDR(_idx, _i) \
21081 ++      (unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
21082 ++#else
21083 + #define MMAP_VADDR(_req,_seg)                                         \
21084 +       (mmap_vstart +                                                  \
21085 +        ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
21086 +        ((_seg) * PAGE_SIZE))
21087 ++#endif
21088
21089 + /*
21090 +  * Each outstanding request that we've passed to the lower device layers has a 
21091 +@@ -377,9 +383,13 @@
21092 +                       goto bad_descriptor;
21093 +               }
21094
21095 ++#ifdef __ia64__
21096 ++              MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
21097 ++#else
21098 +               phys_to_machine_mapping[__pa(MMAP_VADDR(
21099 +                       pending_idx, i)) >> PAGE_SHIFT] =
21100 +                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);
21101 ++#endif
21102
21103 +               pending_handle(pending_idx, i) = map[i].handle;
21104 +       }
21105 +@@ -500,9 +510,22 @@
21106
21107 +       blkif_interface_init();
21108
21109 ++#ifdef __ia64__
21110 ++    {
21111 ++      extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
21112 ++      int i;
21113 ++
21114 ++      mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
21115 ++      printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
21116 ++      for(i = 0; i < MMAP_PAGES; i++)
21117 ++          pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
21118 ++      BUG_ON(mmap_vstart == 0);
21119 ++    }
21120 ++#else
21121 +       page = balloon_alloc_empty_page_range(MMAP_PAGES);
21122 +       BUG_ON(page == NULL);
21123 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
21124 ++#endif
21125
21126 +       pending_cons = 0;
21127 +       pending_prod = MAX_PENDING_REQS;
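The blkback fragment above swaps the linear MMAP_VADDR() formula for a per-page lookup table on ia64, but the index is the same either way: request slot times segments-per-request plus segment. A standalone equivalence check (both constants are assumptions for the demo):

/* MMAP_VADDR equivalence: linear formula vs. ia64-style lookup table. */
#include <assert.h>
#include <stdio.h>

#define MAX_PENDING_REQS		64	/* assumed */
#define BLKIF_MAX_SEGMENTS_PER_REQUEST	11	/* assumed */
#define PAGE_SHIFT			12
#define PAGE_SIZE			(1UL << PAGE_SHIFT)
#define MMAP_PAGES	(MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)

static unsigned long pending_vaddrs[MMAP_PAGES];

int main(void)
{
	unsigned long mmap_vstart = 0x100000, linear, table;
	int i, req = 3, seg = 5;

	for (i = 0; i < MMAP_PAGES; i++)	/* ia64 path: fill the table */
		pending_vaddrs[i] = mmap_vstart + ((unsigned long)i << PAGE_SHIFT);

	/* x86 path: the linear formula from the #else branch */
	linear = mmap_vstart +
		 ((unsigned long)req * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +
		 ((unsigned long)seg * PAGE_SIZE);
	table = pending_vaddrs[req * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg];

	assert(linear == table);
	printf("req %d seg %d -> %#lx\n", req, seg, table);
	return 0;
}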
21128 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/console.c.patch tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/console.c.patch
21129 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/console.c.patch    1970-01-01 01:00:00.000000000 +0100
21130 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/console.c.patch       2006-04-10 00:05:52.000000000 +0200
21131 @@ -0,0 +1,18 @@
21132 +--- xen/console/console.c      2005-11-02 14:13:07.000000000 +0100
21133 ++++ xen.patched/console/console.c      2005-11-02 14:21:20.000000000 +0100
21134 +@@ -768,9 +771,15 @@
21135 + #endif
21136
21137 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
21138 ++#ifdef __ia64__
21139 ++              xencons_priv_irq = bind_virq_to_evtchn(VIRQ_CONSOLE);
21140 ++              bind_evtchn_to_irqhandler(xencons_priv_irq,
21141 ++                              xencons_priv_interrupt, 0, "console", NULL);
21142 ++#else
21143 +               xencons_priv_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0);
21144 +               (void)request_irq(xencons_priv_irq,
21145 +                                 xencons_priv_interrupt, 0, "console", NULL);
21146 ++#endif
21147 +       } else {
21148 +               xencons_ring_register_receiver(xencons_rx);
21149 +       }
21150 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/devmem.c.patch tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/devmem.c.patch
21151 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/devmem.c.patch     1970-01-01 01:00:00.000000000 +0100
21152 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/devmem.c.patch        2006-04-10 00:05:52.000000000 +0200
21153 @@ -0,0 +1,3 @@
21154 +diff -Naur xen/core/devmem.c xen.patched/core/devmem.c
21155 +--- xen/core/devmem.c  2005-09-23 10:54:50.000000000 -0600
21156 ++++ xen.patched/core/devmem.c  2005-09-23 10:57:51.000000000 -0600
21157 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/gnttab.c.patch tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/gnttab.c.patch
21158 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/gnttab.c.patch     1970-01-01 01:00:00.000000000 +0100
21159 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/gnttab.c.patch        2006-04-10 00:05:52.000000000 +0200
21160 @@ -0,0 +1,46 @@
21161 +diff -Naur xen/core/gnttab.c xen.patched/core/gnttab.c
21162 +--- xen/core/gnttab.c  2005-09-23 10:54:50.000000000 -0600
21163 ++++ xen.patched/core/gnttab.c  2005-09-23 10:57:51.000000000 -0600
21164 +@@ -346,6 +350,10 @@
21165 +       if ( hypercall.op != __HYPERVISOR_grant_table_op )
21166 +               return -ENOSYS;
21167
21168 ++
21169 ++#ifdef __ia64__
21170 ++      ret = HYPERVISOR_grant_table_op(hypercall.arg[0], (void *)hypercall.arg[1], hypercall.arg[2]);
21171 ++#else
21172 +       /* hypercall-invoking asm taken from privcmd.c */
21173 +       __asm__ __volatile__ (
21174 +               "pushl %%ebx; pushl %%ecx; pushl %%edx; "
21175 +@@ -359,6 +367,7 @@
21176 +               TRAP_INSTR "; "
21177 +               "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx"
21178 +               : "=a" (ret) : "0" (&hypercall) : "memory" );
21179 ++#endif
21180
21181 +       return ret;
21182 + }
21183 +@@ -423,8 +432,13 @@
21184 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1));
21185 +       BUG_ON(setup.status != 0);
21186
21187 ++#ifdef __ia64__
21188 ++      shared = __va(frames[0] << PAGE_SHIFT);
21189 ++      printk("grant table at %p\n", shared);
21190 ++#else
21191 +       for (i = 0; i < NR_GRANT_FRAMES; i++)
21192 +               set_fixmap(FIX_GNTTAB_END - i, frames[i] << PAGE_SHIFT);
21193 ++#endif
21194
21195 +       return 0;
21196 + }
21197 +@@ -450,7 +466,9 @@
21198
21199 +       BUG_ON(gnttab_resume());
21200
21201 ++#ifndef __ia64__
21202 +       shared = (grant_entry_t *)fix_to_virt(FIX_GNTTAB_END);
21203 ++#endif
21204
21205 +       for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
21206 +               gnttab_list[i] = i + 1;
21207 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/privcmd.c.patch tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/privcmd.c.patch
21208 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/patches/privcmd.c.patch    1970-01-01 01:00:00.000000000 +0100
21209 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/patches/privcmd.c.patch       2006-04-10 00:05:52.000000000 +0200
21210 @@ -0,0 +1,43 @@
21211 +diff -Naur xen/privcmd/privcmd.c xen.patched/privcmd/privcmd.c
21212 +--- xen/privcmd/privcmd.c      2005-09-23 10:54:50.000000000 -0600
21213 ++++ xen.patched/privcmd/privcmd.c      2005-09-23 10:57:51.000000000 -0600
21214 +@@ -180,6 +183,15 @@
21215 +               for (i = 0; i < m.num; i++, addr += PAGE_SIZE, p++) {
21216 +                       if (get_user(mfn, p))
21217 +                               return -EFAULT;
21218 ++#ifdef __ia64__
21219 ++                      ret = remap_pfn_range(vma,
21220 ++                                            addr&PAGE_MASK,
21221 ++                                            mfn,
21222 ++                                            1<<PAGE_SHIFT,
21223 ++                                            vma->vm_page_prot);
21224 ++                      if (ret < 0)
21225 ++                          goto batch_err;
21226 ++#else
21227
21228 +                       ret = create_lookup_pte_addr(vma->vm_mm, addr, &ptep);
21229 +                       if (ret)
21230 +@@ -190,6 +202,7 @@
21231
21232 +                       if (HYPERVISOR_mmu_update(&u, 1, NULL, m.dom) < 0)
21233 +                               put_user(0xF0000000 | mfn, p);
21234 ++#endif
21235 +               }
21236
21237 +               ret = 0;
21238 +@@ -205,6 +218,7 @@
21239 +       break;
21240 + #endif
21241
21242 ++#ifndef __ia64__
21243 +       case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN: {
21244 +               unsigned long m2pv = (unsigned long)machine_to_phys_mapping;
21245 +               pgd_t *pgd = pgd_offset_k(m2pv);
21246 +@@ -216,6 +230,7 @@
21247 +                       -EFAULT: 0;
21248 +       }
21249 +       break;
21250 ++#endif
21251
21252 +       default:
21253 +               ret = -EINVAL;
21254 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/README tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/README
21255 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/README     1970-01-01 01:00:00.000000000 +0100
21256 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/README        2006-04-10 00:05:52.000000000 +0200
21257 @@ -0,0 +1,2 @@
21258 +This is a temporary location for source/Makefiles that need to be
21259 +patched/reworked in drivers/xen to work with xenlinux/ia64.
21260 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/drivers/xenia64_init.c tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/xenia64_init.c
21261 --- ref-linux-2.6.16.9/arch/ia64/xen/drivers/xenia64_init.c     1970-01-01 01:00:00.000000000 +0100
21262 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/drivers/xenia64_init.c        2006-04-10 00:05:52.000000000 +0200
21263 @@ -0,0 +1,55 @@
21264 +#ifdef __ia64__
21265 +#include <linux/config.h>
21266 +#include <linux/module.h>
21267 +#include <linux/efi.h>
21268 +#include <asm/sal.h>
21269 +#include <asm/hypervisor.h>
21270 +/* #include <asm-xen/evtchn.h> */
21271 +#include <linux/vmalloc.h>
21272 +
21273 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)0xf100000000000000;
21274 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
21275 +
21276 +static int initialized;
21277 +start_info_t *xen_start_info;
21278 +
21279 +int xen_init(void)
21280 +{
21281 +       shared_info_t *s = HYPERVISOR_shared_info;
21282 +
21283 +       if (initialized)
21284 +               return running_on_xen ? 0 : -1;
21285 +
21286 +       if (!running_on_xen)
21287 +               return -1;
21288 +
21289 +       xen_start_info = __va(s->arch.start_info_pfn << PAGE_SHIFT);
21290 +       xen_start_info->flags = s->arch.flags;
21291 +       printk(KERN_INFO "Running on Xen! start_info_pfn=0x%lx nr_pages=%d flags=0x%x\n",
21292 +               s->arch.start_info_pfn, xen_start_info->nr_pages,
21293 +               xen_start_info->flags);
21294 +
21295 +       evtchn_init();
21296 +       initialized = 1;
21297 +       return 0;
21298 +}
21299 +
21300 +/* We only need a range of legal virtual addresses here; in the end an
21301 + * identity-mapped range is used for the gnttab mapping instead.
21302 + */
21303 +unsigned long alloc_empty_foreign_map_page_range(unsigned long pages)
21304 +{
21305 +       struct vm_struct *vma;
21306 +
21307 +       if ( (vma = get_vm_area(PAGE_SIZE * pages, VM_ALLOC)) == NULL )
21308 +               return 0;
21309 +
21310 +       return (unsigned long)vma->addr;
21311 +}
21312 +
21313 +#if 0
21314 +/* These should be #define'd, but some drivers use them without
21315 + * a convenient arch include */
21316 +unsigned long mfn_to_pfn(unsigned long mfn) { return mfn; }
21317 +#endif
21318 +#endif
21319 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/hypercall.S tmp-linux-2.6-xen.patch/arch/ia64/xen/hypercall.S
21320 --- ref-linux-2.6.16.9/arch/ia64/xen/hypercall.S        1970-01-01 01:00:00.000000000 +0100
21321 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/hypercall.S   2006-04-10 00:05:52.000000000 +0200
21322 @@ -0,0 +1,365 @@
21323 +/*
21324 + * Support routines for Xen hypercalls
21325 + *
21326 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
21327 + */
21328 +
21329 +#include <linux/config.h>
21330 +#include <asm/processor.h>
21331 +#include <asm/asmmacro.h>
21332 +
21333 +GLOBAL_ENTRY(xen_get_ivr)
21334 +       movl r8=running_on_xen;;
21335 +       ld4 r8=[r8];;
21336 +       cmp.eq p7,p0=r8,r0;;
21337 +(p7)   mov r8=cr.ivr;;
21338 +(p7)   br.ret.sptk.many rp
21339 +       ;;
21340 +       movl r9=XSI_PSR_IC
21341 +       ;;
21342 +       ld8 r10=[r9]
21343 +       ;;
21344 +       st8 [r9]=r0
21345 +       ;;
21346 +       XEN_HYPER_GET_IVR
21347 +       ;;
21348 +       st8 [r9]=r10
21349 +       br.ret.sptk.many rp
21350 +       ;;
21351 +END(xen_get_ivr)
21352 +
21353 +GLOBAL_ENTRY(xen_get_tpr)
21354 +       movl r8=running_on_xen;;
21355 +       ld4 r8=[r8];;
21356 +       cmp.eq p7,p0=r8,r0;;
21357 +(p7)   mov r8=cr.tpr;;
21358 +(p7)   br.ret.sptk.many rp
21359 +       ;;
21360 +       movl r9=XSI_PSR_IC
21361 +       ;;
21362 +       ld8 r10=[r9]
21363 +       ;;
21364 +       st8 [r9]=r0
21365 +       ;;
21366 +       XEN_HYPER_GET_TPR
21367 +       ;;
21368 +       st8 [r9]=r10
21369 +       br.ret.sptk.many rp
21370 +       ;;
21371 +END(xen_get_tpr)
21372 +
21373 +GLOBAL_ENTRY(xen_set_tpr)
21374 +       movl r8=running_on_xen;;
21375 +       ld4 r8=[r8];;
21376 +       cmp.eq p7,p0=r8,r0;;
21377 +(p7)   mov cr.tpr=r32;;
21378 +(p7)   br.ret.sptk.many rp
21379 +       ;;
21380 +       movl r9=XSI_PSR_IC
21381 +       mov r8=r32
21382 +       ;;
21383 +       ld8 r10=[r9]
21384 +       ;;
21385 +       st8 [r9]=r0
21386 +       ;;
21387 +       XEN_HYPER_SET_TPR
21388 +       ;;
21389 +       st8 [r9]=r10
21390 +       br.ret.sptk.many rp
21391 +       ;;
21392 +END(xen_set_tpr)
21393 +
21394 +GLOBAL_ENTRY(xen_eoi)
21395 +       movl r8=running_on_xen;;
21396 +       ld4 r8=[r8];;
21397 +       cmp.eq p7,p0=r8,r0;;
21398 +(p7)   mov cr.eoi=r0;;
21399 +(p7)   br.ret.sptk.many rp
21400 +       ;;
21401 +       movl r9=XSI_PSR_IC
21402 +       mov r8=r32
21403 +       ;;
21404 +       ld8 r10=[r9]
21405 +       ;;
21406 +       st8 [r9]=r0
21407 +       ;;
21408 +       XEN_HYPER_EOI
21409 +       ;;
21410 +       st8 [r9]=r10
21411 +       br.ret.sptk.many rp
21412 +       ;;
21413 +END(xen_eoi)
21414 +
21415 +GLOBAL_ENTRY(xen_thash)
21416 +       movl r8=running_on_xen;;
21417 +       ld4 r8=[r8];;
21418 +       cmp.eq p7,p0=r8,r0;;
21419 +(p7)   thash r8=r32;;
21420 +(p7)   br.ret.sptk.many rp
21421 +       ;;
21422 +       movl r9=XSI_PSR_IC
21423 +       mov r8=r32
21424 +       ;;
21425 +       ld8 r10=[r9]
21426 +       ;;
21427 +       st8 [r9]=r0
21428 +       ;;
21429 +       XEN_HYPER_THASH
21430 +       ;;
21431 +       st8 [r9]=r10
21432 +       ;;
21433 +       br.ret.sptk.many rp
21434 +       ;;
21435 +END(xen_thash)
21436 +
21437 +GLOBAL_ENTRY(xen_set_itm)
21438 +       movl r8=running_on_xen;;
21439 +       ld4 r8=[r8];;
21440 +       cmp.eq p7,p0=r8,r0;;
21441 +(p7)   mov cr.itm=r32;;
21442 +(p7)   br.ret.sptk.many rp
21443 +       ;;
21444 +       movl r9=XSI_PSR_IC
21445 +       mov r8=r32
21446 +       ;;
21447 +       ld8 r10=[r9]
21448 +       ;;
21449 +       st8 [r9]=r0
21450 +       ;;
21451 +       XEN_HYPER_SET_ITM
21452 +       ;;
21453 +       st8 [r9]=r10
21454 +       ;;
21455 +       br.ret.sptk.many rp
21456 +       ;;
21457 +END(xen_set_itm)
21458 +
21459 +GLOBAL_ENTRY(xen_ptcga)
21460 +       movl r8=running_on_xen;;
21461 +       ld4 r8=[r8];;
21462 +       cmp.eq p7,p0=r8,r0;;
21463 +(p7)   ptc.ga r32,r33;;
21464 +(p7)   br.ret.sptk.many rp
21465 +       ;;
21466 +       movl r11=XSI_PSR_IC
21467 +       mov r8=r32
21468 +       mov r9=r33
21469 +       ;;
21470 +       ld8 r10=[r11]
21471 +       ;;
21472 +       st8 [r11]=r0
21473 +       ;;
21474 +       XEN_HYPER_PTC_GA
21475 +       ;;
21476 +       st8 [r11]=r10
21477 +       ;;
21478 +       br.ret.sptk.many rp
21479 +       ;;
21480 +END(xen_ptcga)
21481 +
21482 +GLOBAL_ENTRY(xen_get_rr)
21483 +       movl r8=running_on_xen;;
21484 +       ld4 r8=[r8];;
21485 +       cmp.eq p7,p0=r8,r0;;
21486 +(p7)   mov r8=rr[r32];;
21487 +(p7)   br.ret.sptk.many rp
21488 +       ;;
21489 +       movl r9=XSI_PSR_IC
21490 +       mov r8=r32
21491 +       ;;
21492 +       ld8 r10=[r9]
21493 +       ;;
21494 +       st8 [r9]=r0
21495 +       ;;
21496 +       XEN_HYPER_GET_RR
21497 +       ;;
21498 +       st8 [r9]=r10
21499 +       ;;
21500 +       br.ret.sptk.many rp
21501 +       ;;
21502 +END(xen_get_rr)
21503 +
21504 +GLOBAL_ENTRY(xen_set_rr)
21505 +       movl r8=running_on_xen;;
21506 +       ld4 r8=[r8];;
21507 +       cmp.eq p7,p0=r8,r0;;
21508 +(p7)   mov rr[r32]=r33;;
21509 +(p7)   br.ret.sptk.many rp
21510 +       ;;
21511 +       movl r11=XSI_PSR_IC
21512 +       mov r8=r32
21513 +       mov r9=r33
21514 +       ;;
21515 +       ld8 r10=[r11]
21516 +       ;;
21517 +       st8 [r11]=r0
21518 +       ;;
21519 +       XEN_HYPER_SET_RR
21520 +       ;;
21521 +       st8 [r11]=r10
21522 +       ;;
21523 +       br.ret.sptk.many rp
21524 +       ;;
21525 +END(xen_set_rr)
21526 +
21527 +GLOBAL_ENTRY(xen_set_kr)
21528 +       movl r8=running_on_xen;;
21529 +       ld4 r8=[r8];;
21530 +       cmp.ne p7,p0=r8,r0;;
21531 +(p7)   br.cond.spnt.few 1f;
21532 +       ;;
21533 +       cmp.eq p7,p0=r8,r0
21534 +       adds r8=-1,r8;;
21535 +(p7)   mov ar0=r9
21536 +(p7)   br.ret.sptk.many rp;;
21537 +       cmp.eq p7,p0=r8,r0
21538 +       adds r8=-1,r8;;
21539 +(p7)   mov ar1=r9
21540 +(p7)   br.ret.sptk.many rp;;
21541 +       cmp.eq p7,p0=r8,r0
21542 +       adds r8=-1,r8;;
21543 +(p7)   mov ar2=r9
21544 +(p7)   br.ret.sptk.many rp;;
21545 +       cmp.eq p7,p0=r8,r0
21546 +       adds r8=-1,r8;;
21547 +(p7)   mov ar3=r9
21548 +(p7)   br.ret.sptk.many rp;;
21549 +       cmp.eq p7,p0=r8,r0
21550 +       adds r8=-1,r8;;
21551 +(p7)   mov ar4=r9
21552 +(p7)   br.ret.sptk.many rp;;
21553 +       cmp.eq p7,p0=r8,r0
21554 +       adds r8=-1,r8;;
21555 +(p7)   mov ar5=r9
21556 +(p7)   br.ret.sptk.many rp;;
21557 +       cmp.eq p7,p0=r8,r0
21558 +       adds r8=-1,r8;;
21559 +(p7)   mov ar6=r9
21560 +(p7)   br.ret.sptk.many rp;;
21561 +       cmp.eq p7,p0=r8,r0
21562 +       adds r8=-1,r8;;
21563 +(p7)   mov ar7=r9
21564 +(p7)   br.ret.sptk.many rp;;
21565 +
21566 +1:     movl r11=XSI_PSR_IC
21567 +       mov r8=r32
21568 +       mov r9=r33
21569 +       ;;
21570 +       ld8 r10=[r11]
21571 +       ;;
21572 +       st8 [r11]=r0
21573 +       ;;
21574 +       XEN_HYPER_SET_KR
21575 +       ;;
21576 +       st8 [r11]=r10
21577 +       ;;
21578 +       br.ret.sptk.many rp
21579 +END(xen_set_kr)
21580 +
21581 +GLOBAL_ENTRY(xen_fc)
21582 +       movl r8=running_on_xen;;
21583 +       ld4 r8=[r8];;
21584 +       cmp.eq p7,p0=r8,r0;;
21585 +(p7)   fc r32;;
21586 +(p7)   br.ret.sptk.many rp
21587 +       ;;
21588 +       movl r9=XSI_PSR_IC
21589 +       mov r8=r32
21590 +       ;;
21591 +       ld8 r10=[r9]
21592 +       ;;
21593 +       st8 [r9]=r0
21594 +       ;;
21595 +       XEN_HYPER_FC
21596 +       ;;
21597 +       st8 [r9]=r10
21598 +       ;;
21599 +       br.ret.sptk.many rp
21600 +END(xen_fc)
21601 +
21602 +GLOBAL_ENTRY(xen_get_cpuid)
21603 +       movl r8=running_on_xen;;
21604 +       ld4 r8=[r8];;
21605 +       cmp.eq p7,p0=r8,r0;;
21606 +(p7)   mov r8=cpuid[r32];;
21607 +(p7)   br.ret.sptk.many rp
21608 +       ;;
21609 +       movl r9=XSI_PSR_IC
21610 +       mov r8=r32
21611 +       ;;
21612 +       ld8 r10=[r9]
21613 +       ;;
21614 +       st8 [r9]=r0
21615 +       ;;
21616 +       XEN_HYPER_GET_CPUID
21617 +       ;;
21618 +       st8 [r9]=r10
21619 +       ;;
21620 +       br.ret.sptk.many rp
21621 +END(xen_get_cpuid)
21622 +
21623 +GLOBAL_ENTRY(xen_get_pmd)
21624 +       movl r8=running_on_xen;;
21625 +       ld4 r8=[r8];;
21626 +       cmp.eq p7,p0=r8,r0;;
21627 +(p7)   mov r8=pmd[r32];;
21628 +(p7)   br.ret.sptk.many rp
21629 +       ;;
21630 +       movl r9=XSI_PSR_IC
21631 +       mov r8=r32
21632 +       ;;
21633 +       ld8 r10=[r9]
21634 +       ;;
21635 +       st8 [r9]=r0
21636 +       ;;
21637 +       XEN_HYPER_GET_PMD
21638 +       ;;
21639 +       st8 [r9]=r10
21640 +       ;;
21641 +       br.ret.sptk.many rp
21642 +END(xen_get_pmd)
21643 +
21644 +#ifdef CONFIG_IA32_SUPPORT
21645 +GLOBAL_ENTRY(xen_get_eflag)
21646 +       movl r8=running_on_xen;;
21647 +       ld4 r8=[r8];;
21648 +       cmp.eq p7,p0=r8,r0;;
21649 +(p7)   mov r8=ar24;;
21650 +(p7)   br.ret.sptk.many rp
21651 +       ;;
21652 +       movl r9=XSI_PSR_IC
21653 +       mov r8=r32
21654 +       ;;
21655 +       ld8 r10=[r9]
21656 +       ;;
21657 +       st8 [r9]=r0
21658 +       ;;
21659 +       XEN_HYPER_GET_EFLAG
21660 +       ;;
21661 +       st8 [r9]=r10
21662 +       ;;
21663 +       br.ret.sptk.many rp
21664 +END(xen_get_eflag)
21665 +       
21666 +// some bits aren't set if pl!=0, see SDM vol1 3.1.8
21667 +GLOBAL_ENTRY(xen_set_eflag)
21668 +       movl r8=running_on_xen;;
21669 +       ld4 r8=[r8];;
21670 +       cmp.eq p7,p0=r8,r0;;
21671 +(p7)   mov ar24=r32
21672 +(p7)   br.ret.sptk.many rp
21673 +       ;;
21674 +       movl r9=XSI_PSR_IC
21675 +       mov r8=r32
21676 +       ;;
21677 +       ld8 r10=[r9]
21678 +       ;;
21679 +       st8 [r9]=r0
21680 +       ;;
21681 +       XEN_HYPER_SET_EFLAG
21682 +       ;;
21683 +       st8 [r9]=r10
21684 +       ;;
21685 +       br.ret.sptk.many rp
21686 +END(xen_set_eflag)
21687 +#endif
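Every stub in hypercall.S above follows one template: if running_on_xen is zero, execute the native instruction and return; otherwise save the guest's virtual psr.ic word at XSI_PSR_IC, clear it around the hyperprivop, then restore it. A C model of that control flow (the register values and the hyperprivop are stand-ins):

/* C model of the hypercall.S stub template; values are illustrative. */
#include <stdio.h>

static int running_on_xen = 1;		/* set during early boot */
static unsigned long xsi_psr_ic = 1;	/* shared-page virtual psr.ic word */

static unsigned long native_get_ivr(void) { return 0x42; }  /* mov r8=cr.ivr */
static unsigned long hyper_get_ivr(void)  { return 0x42; }  /* XEN_HYPER_GET_IVR */

static unsigned long xen_get_ivr(void)
{
	unsigned long saved_ic, ret;

	if (!running_on_xen)
		return native_get_ivr();	/* (p7) mov r8=cr.ivr */

	saved_ic = xsi_psr_ic;			/* ld8 r10=[r9] */
	xsi_psr_ic = 0;				/* st8 [r9]=r0  */
	ret = hyper_get_ivr();			/* hyperprivop  */
	xsi_psr_ic = saved_ic;			/* st8 [r9]=r10 */
	return ret;
}

int main(void)
{
	printf("ivr=%#lx, psr.ic restored to %lu\n", xen_get_ivr(), xsi_psr_ic);
	return 0;
}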
21688 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/Makefile tmp-linux-2.6-xen.patch/arch/ia64/xen/Makefile
21689 --- ref-linux-2.6.16.9/arch/ia64/xen/Makefile   1970-01-01 01:00:00.000000000 +0100
21690 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/Makefile      2006-04-10 00:05:52.000000000 +0200
21691 @@ -0,0 +1,5 @@
21692 +#
21693 +# Makefile for Xen components
21694 +#
21695 +
21696 +obj-y := hypercall.o xenivt.o xenentry.o xensetup.o xenpal.o xenhpski.o xenconsole.o xen_ksyms.o
21697 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/xenconsole.c tmp-linux-2.6-xen.patch/arch/ia64/xen/xenconsole.c
21698 --- ref-linux-2.6.16.9/arch/ia64/xen/xenconsole.c       1970-01-01 01:00:00.000000000 +0100
21699 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/xenconsole.c  2006-04-10 00:05:52.000000000 +0200
21700 @@ -0,0 +1,19 @@
21701 +#include <linux/config.h>
21702 +#include <linux/console.h>
21703 +
21704 +int
21705 +early_xen_console_setup (char *cmdline)
21706 +{
21707 +#ifdef CONFIG_XEN
21708 +#ifndef CONFIG_IA64_HP_SIM
21709 +       extern int running_on_xen;
21710 +       if (running_on_xen) {
21711 +               extern struct console hpsim_cons;
21712 +               hpsim_cons.flags |= CON_BOOT;
21713 +               register_console(&hpsim_cons);
21714 +               return 0;
21715 +       }
21716 +#endif
21717 +#endif
21718 +       return -1;
21719 +}
21720 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/xenentry.S tmp-linux-2.6-xen.patch/arch/ia64/xen/xenentry.S
21721 --- ref-linux-2.6.16.9/arch/ia64/xen/xenentry.S 1970-01-01 01:00:00.000000000 +0100
21722 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/xenentry.S    2006-04-10 00:05:52.000000000 +0200
21723 @@ -0,0 +1,850 @@
21724 +/*
21725 + * ia64/xen/entry.S
21726 + *
21727 + * Alternate kernel routines for Xen.  Heavily leveraged from
21728 + *   ia64/kernel/entry.S
21729 + *
21730 + * Copyright (C) 2005 Hewlett-Packard Co
21731 + *     Dan Magenheimer <dan.magenheimer@hp.com>
21732 + */
21733 +
21734 +#include <linux/config.h>
21735 +
21736 +#include <asm/asmmacro.h>
21737 +#include <asm/cache.h>
21738 +#include <asm/errno.h>
21739 +#include <asm/kregs.h>
21740 +#include <asm/asm-offsets.h>
21741 +#include <asm/pgtable.h>
21742 +#include <asm/percpu.h>
21743 +#include <asm/processor.h>
21744 +#include <asm/thread_info.h>
21745 +#include <asm/unistd.h>
21746 +
21747 +#ifdef CONFIG_XEN
21748 +#include "xenminstate.h"
21749 +#else
21750 +#include "minstate.h"
21751 +#endif
21752 +
21753 +/*
21754 + * prev_task <- ia64_switch_to(struct task_struct *next)
21755 + *     With Ingo's new scheduler, interrupts are disabled when this routine gets
21756 + *     called.  The code starting at .map relies on this.  The rest of the code
21757 + *     doesn't care about the interrupt masking status.
21758 + */
21759 +#ifdef CONFIG_XEN
21760 +GLOBAL_ENTRY(xen_switch_to)
21761 +       .prologue
21762 +       alloc r16=ar.pfs,1,0,0,0
21763 +       movl r22=running_on_xen;;
21764 +       ld4 r22=[r22];;
21765 +       cmp.eq p7,p0=r22,r0
21766 +(p7)   br.cond.sptk.many __ia64_switch_to;;
21767 +#else
21768 +GLOBAL_ENTRY(ia64_switch_to)
21769 +       .prologue
21770 +       alloc r16=ar.pfs,1,0,0,0
21771 +#endif
21772 +       DO_SAVE_SWITCH_STACK
21773 +       .body
21774 +
21775 +       adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
21776 +       movl r25=init_task
21777 +       mov r27=IA64_KR(CURRENT_STACK)
21778 +       adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
21779 +       dep r20=0,in0,61,3              // physical address of "next"
21780 +       ;;
21781 +       st8 [r22]=sp                    // save kernel stack pointer of old task
21782 +       shr.u r26=r20,IA64_GRANULE_SHIFT
21783 +       cmp.eq p7,p6=r25,in0
21784 +       ;;
21785 +#ifdef CONFIG_XEN
21786 +       movl r8=XSI_PSR_IC
21787 +       ;;
21788 +       st4 [r8]=r0     // force psr.ic off for hyperprivop(s)
21789 +       ;;
21790 +#endif
21791 +       /*
21792 +        * If we've already mapped this task's page, we can skip doing it again.
21793 +        */
21794 +(p6)   cmp.eq p7,p6=r26,r27
21795 +(p6)   br.cond.dpnt .map
21796 +       ;;
21797 +.done:
21798 +#ifdef CONFIG_XEN
21799 +       // psr.ic already off
21800 +       // update "current" application register
21801 +       mov r8=IA64_KR_CURRENT
21802 +       mov r9=in0;;
21803 +       XEN_HYPER_SET_KR
21804 +       ld8 sp=[r21]                    // load kernel stack pointer of new task
21805 +       movl r27=XSI_PSR_IC
21806 +       mov r8=1
21807 +       ;;
21808 +       st4 [r27]=r8                    // psr.ic back on
21809 +       ;;
21810 +#else
21811 +(p6)   ssm psr.ic                      // if we had to map, reenable the psr.ic bit FIRST!!!
21812 +       ;;
21813 +(p6)   srlz.d
21814 +       ld8 sp=[r21]                    // load kernel stack pointer of new task
21815 +       mov IA64_KR(CURRENT)=in0        // update "current" application register
21816 +#endif
21817 +       mov r8=r13                      // return pointer to previously running task
21818 +       mov r13=in0                     // set "current" pointer
21819 +       ;;
21820 +       DO_LOAD_SWITCH_STACK
21821 +
21822 +#ifdef CONFIG_SMP
21823 +       sync.i                          // ensure "fc"s done by this CPU are visible on other CPUs
21824 +#endif
21825 +       br.ret.sptk.many rp             // boogie on out in new context
21826 +
21827 +.map:
21828 +#ifdef CONFIG_XEN
21829 +       // psr.ic already off
21830 +#else
21831 +       rsm psr.ic                      // interrupts (psr.i) are already disabled here
21832 +#endif
21833 +       movl r25=PAGE_KERNEL
21834 +       ;;
21835 +       srlz.d
21836 +       or r23=r25,r20                  // construct PA | page properties
21837 +       mov r25=IA64_GRANULE_SHIFT<<2
21838 +       ;;
21839 +#ifdef CONFIG_XEN
21840 +       movl r8=XSI_ITIR
21841 +       ;;
21842 +       st8 [r8]=r25
21843 +       ;;
21844 +       movl r8=XSI_IFA
21845 +       ;;
21846 +       st8 [r8]=in0                     // VA of next task...
21847 +       ;;
21848 +       mov r25=IA64_TR_CURRENT_STACK
21849 +       // remember last page we mapped...
21850 +       mov r8=IA64_KR_CURRENT_STACK
21851 +       mov r9=r26;;
21852 +       XEN_HYPER_SET_KR;;
21853 +#else
21854 +       mov cr.itir=r25
21855 +       mov cr.ifa=in0                  // VA of next task...
21856 +       ;;
21857 +       mov r25=IA64_TR_CURRENT_STACK
21858 +       mov IA64_KR(CURRENT_STACK)=r26  // remember last page we mapped...
21859 +#endif
21860 +       ;;
21861 +       itr.d dtr[r25]=r23              // wire in new mapping...
21862 +       br.cond.sptk .done
21863 +#ifdef CONFIG_XEN
21864 +END(xen_switch_to)
21865 +#else
21866 +END(ia64_switch_to)
21867 +#endif
21868 +
21869 +       /*
21870 +        * Invoke a system call, but do some tracing before and after the call.
21871 +        * We MUST preserve the current register frame throughout this routine
21872 +        * because some system calls (such as ia64_execve) directly
21873 +        * manipulate ar.pfs.
21874 +        */
21875 +#ifdef CONFIG_XEN
21876 +GLOBAL_ENTRY(xen_trace_syscall)
21877 +       PT_REGS_UNWIND_INFO(0)
21878 +       movl r16=running_on_xen;;
21879 +       ld4 r16=[r16];;
21880 +       cmp.eq p7,p0=r16,r0
21881 +(p7)   br.cond.sptk.many __ia64_trace_syscall;;
21882 +#else
21883 +GLOBAL_ENTRY(ia64_trace_syscall)
21884 +       PT_REGS_UNWIND_INFO(0)
21885 +#endif
21886 +       /*
21887 +        * We need to preserve the scratch registers f6-f11 in case the system
21888 +        * call is sigreturn.
21889 +        */
21890 +       adds r16=PT(F6)+16,sp
21891 +       adds r17=PT(F7)+16,sp
21892 +       ;;
21893 +       stf.spill [r16]=f6,32
21894 +       stf.spill [r17]=f7,32
21895 +       ;;
21896 +       stf.spill [r16]=f8,32
21897 +       stf.spill [r17]=f9,32
21898 +       ;;
21899 +       stf.spill [r16]=f10
21900 +       stf.spill [r17]=f11
21901 +       br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
21902 +       adds r16=PT(F6)+16,sp
21903 +       adds r17=PT(F7)+16,sp
21904 +       ;;
21905 +       ldf.fill f6=[r16],32
21906 +       ldf.fill f7=[r17],32
21907 +       ;;
21908 +       ldf.fill f8=[r16],32
21909 +       ldf.fill f9=[r17],32
21910 +       ;;
21911 +       ldf.fill f10=[r16]
21912 +       ldf.fill f11=[r17]
21913 +       // the syscall number may have changed, so re-load it and re-calculate the
21914 +       // syscall entry-point:
21915 +       adds r15=PT(R15)+16,sp                  // r15 = &pt_regs.r15 (syscall #)
21916 +       ;;
21917 +       ld8 r15=[r15]
21918 +       mov r3=NR_syscalls - 1
21919 +       ;;
21920 +       adds r15=-1024,r15
21921 +       movl r16=sys_call_table
21922 +       ;;
21923 +       shladd r20=r15,3,r16                    // r20 = sys_call_table + 8*(syscall-1024)
21924 +       cmp.leu p6,p7=r15,r3
21925 +       ;;
21926 +(p6)   ld8 r20=[r20]                           // load address of syscall entry point
21927 +(p7)   movl r20=sys_ni_syscall
21928 +       ;;
21929 +       mov b6=r20
21930 +       br.call.sptk.many rp=b6                 // do the syscall
21931 +.strace_check_retval:
21932 +       cmp.lt p6,p0=r8,r0                      // syscall failed?
21933 +       adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
21934 +       adds r3=PT(R10)+16,sp                   // r3 = &pt_regs.r10
21935 +       mov r10=0
21936 +(p6)   br.cond.sptk strace_error               // syscall failed ->
21937 +       ;;                                      // avoid RAW on r10
21938 +.strace_save_retval:
21939 +.mem.offset 0,0; st8.spill [r2]=r8             // store return value in slot for r8
21940 +.mem.offset 8,0; st8.spill [r3]=r10            // clear error indication in slot for r10
21941 +       br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
21942 +.ret3: br.cond.sptk .work_pending_syscall_end
21943 +
21944 +strace_error:
21945 +       ld8 r3=[r2]                             // load pt_regs.r8
21946 +       sub r9=0,r8                             // negate return value to get errno value
21947 +       ;;
21948 +       cmp.ne p6,p0=r3,r0                      // is pt_regs.r8!=0?
21949 +       adds r3=16,r2                           // r3=&pt_regs.r10
21950 +       ;;
21951 +(p6)   mov r10=-1
21952 +(p6)   mov r8=r9
21953 +       br.cond.sptk .strace_save_retval
21954 +#ifdef CONFIG_XEN
21955 +END(xen_trace_syscall)
21956 +#else
21957 +END(ia64_trace_syscall)
21958 +#endif
21959 +
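After the tracer returns, the syscall number in pt_regs.r15 may have been rewritten, so the code above re-derives the entry point with a bounds check; the unsigned compare (cmp.leu) also rejects numbers below the ia64 syscall base of 1024. A hedged C rendering (the NR_SYSCALLS value is illustrative):

        typedef long (*syscall_fn)(void);

        #define NR_SYSCALLS  285                 /* illustrative */
        #define SYSCALL_BASE 1024                /* ia64 syscalls start at 1024 */

        extern syscall_fn sys_call_table[NR_SYSCALLS];
        extern long sys_ni_syscall(void);

        static long redispatch(unsigned long r15)
        {
                unsigned long nr = r15 - SYSCALL_BASE;   /* adds r15=-1024,r15 */
                if (nr <= NR_SYSCALLS - 1)               /* cmp.leu p6,p7=r15,r3 */
                        return sys_call_table[nr]();     /* (p6) ld8 r20=[r20]; br.call b6 */
                return sys_ni_syscall();                 /* (p7) movl r20=sys_ni_syscall */
        }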
21960 +/*
21961 + * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
21962 + *     need to switch to bank 0 and doesn't restore the scratch registers.
21963 + *     To avoid leaking kernel bits, the scratch registers are set to
21964 + *     the following known-to-be-safe values:
21965 + *
21966 + *               r1: restored (global pointer)
21967 + *               r2: cleared
21968 + *               r3: 1 (when returning to user-level)
21969 + *           r8-r11: restored (syscall return value(s))
21970 + *              r12: restored (user-level stack pointer)
21971 + *              r13: restored (user-level thread pointer)
21972 + *              r14: cleared
21973 + *              r15: restored (syscall #)
21974 + *          r16-r17: cleared
21975 + *              r18: user-level b6
21976 + *              r19: cleared
21977 + *              r20: user-level ar.fpsr
21978 + *              r21: user-level b0
21979 + *              r22: cleared
21980 + *              r23: user-level ar.bspstore
21981 + *              r24: user-level ar.rnat
21982 + *              r25: user-level ar.unat
21983 + *              r26: user-level ar.pfs
21984 + *              r27: user-level ar.rsc
21985 + *              r28: user-level ip
21986 + *              r29: user-level psr
21987 + *              r30: user-level cfm
21988 + *              r31: user-level pr
21989 + *           f6-f11: cleared
21990 + *               pr: restored (user-level pr)
21991 + *               b0: restored (user-level rp)
21992 + *               b6: restored
21993 + *               b7: cleared
21994 + *          ar.unat: restored (user-level ar.unat)
21995 + *           ar.pfs: restored (user-level ar.pfs)
21996 + *           ar.rsc: restored (user-level ar.rsc)
21997 + *          ar.rnat: restored (user-level ar.rnat)
21998 + *      ar.bspstore: restored (user-level ar.bspstore)
21999 + *          ar.fpsr: restored (user-level ar.fpsr)
22000 + *           ar.ccv: cleared
22001 + *           ar.csd: cleared
22002 + *           ar.ssd: cleared
22003 + */
22004 +#ifdef CONFIG_XEN
22005 +GLOBAL_ENTRY(xen_leave_syscall)
22006 +       PT_REGS_UNWIND_INFO(0)
22007 +       movl r22=running_on_xen;;
22008 +       ld4 r22=[r22];;
22009 +       cmp.eq p7,p0=r22,r0
22010 +(p7)   br.cond.sptk.many __ia64_leave_syscall;;
22011 +#else
22012 +ENTRY(ia64_leave_syscall)
22013 +       PT_REGS_UNWIND_INFO(0)
22014 +#endif
22015 +       /*
22016 +        * work.need_resched etc. mustn't get changed by this CPU before it returns to
22017 +        * user- or fsys-mode, hence we disable interrupts early on.
22018 +        *
22019 +        * p6 controls whether current_thread_info()->flags needs to be checked for
22020 +        * extra work.  We always check for extra work when returning to user-level.
22021 +        * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
22022 +        * is 0.  After extra work processing has been completed, execution
22023 +        * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
22024 +        * needs to be redone.
22025 +        */
22026 +#ifdef CONFIG_PREEMPT
22027 +       rsm psr.i                               // disable interrupts
22028 +       cmp.eq pLvSys,p0=r0,r0                  // pLvSys=1: leave from syscall
22029 +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
22030 +       ;;
22031 +       .pred.rel.mutex pUStk,pKStk
22032 +(pKStk) ld4 r21=[r20]                  // r21 <- preempt_count
22033 +(pUStk)        mov r21=0                       // r21 <- 0
22034 +       ;;
22035 +       cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
22036 +#else /* !CONFIG_PREEMPT */
22037 +#ifdef CONFIG_XEN
22038 +       movl r2=XSI_PSR_I
22039 +       ;;
22040 +(pUStk)        st4 [r2]=r0
22041 +#else
22042 +(pUStk)        rsm psr.i
22043 +#endif
22044 +       cmp.eq pLvSys,p0=r0,r0          // pLvSys=1: leave from syscall
22045 +(pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
22046 +#endif
22047 +.work_processed_syscall:
22048 +       adds r2=PT(LOADRS)+16,r12
22049 +       adds r3=PT(AR_BSPSTORE)+16,r12
22050 +       adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
22051 +       ;;
22052 +(p6)   ld4 r31=[r18]                           // load current_thread_info()->flags
22053 +       ld8 r19=[r2],PT(B6)-PT(LOADRS)          // load ar.rsc value for "loadrs"
22054 +       mov b7=r0               // clear b7
22055 +       ;;
22056 +       ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)    // load ar.bspstore (may be garbage)
22057 +       ld8 r18=[r2],PT(R9)-PT(B6)              // load b6
22058 +(p6)   and r15=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
22059 +       ;;
22060 +       mov r16=ar.bsp                          // M2  get existing backing store pointer
22061 +(p6)   cmp4.ne.unc p6,p0=r15, r0               // any special work pending?
22062 +(p6)   br.cond.spnt .work_pending_syscall
22063 +       ;;
22064 +       // start restoring the state saved on the kernel stack (struct pt_regs):
22065 +       ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
22066 +       ld8 r11=[r3],PT(CR_IIP)-PT(R11)
22067 +       mov f6=f0               // clear f6
22068 +       ;;
22069 +       invala                  // M0|1 invalidate ALAT
22070 +#ifdef CONFIG_XEN
22071 +       movl r29=XSI_PSR_IC
22072 +       ;;
22073 +       st8     [r29]=r0        // note: clears both vpsr.i and vpsr.ic!
22074 +       ;;
22075 +#else
22076 +       rsm psr.i | psr.ic      // M2 initiate turning off of interrupt and interruption collection
22077 +#endif
22078 +       mov f9=f0               // clear f9
22079 +
22080 +       ld8 r29=[r2],16         // load cr.ipsr
22081 +       ld8 r28=[r3],16                 // load cr.iip
22082 +       mov f8=f0               // clear f8
22083 +       ;;
22084 +       ld8 r30=[r2],16         // M0|1 load cr.ifs
22085 +       mov.m ar.ssd=r0         // M2 clear ar.ssd
22086 +       cmp.eq p9,p0=r0,r0      // set p9 to indicate that we should restore cr.ifs
22087 +       ;;
22088 +       ld8 r25=[r3],16         // M0|1 load ar.unat
22089 +       mov.m ar.csd=r0         // M2 clear ar.csd
22090 +       mov r22=r0              // clear r22
22091 +       ;;
22092 +       ld8 r26=[r2],PT(B0)-PT(AR_PFS)  // M0|1 load ar.pfs
22093 +(pKStk)        mov r22=psr             // M2 read PSR now that interrupts are disabled
22094 +       mov f10=f0              // clear f10
22095 +       ;;
22096 +       ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
22097 +       ld8 r27=[r3],PT(PR)-PT(AR_RSC)  // load ar.rsc
22098 +       mov f11=f0              // clear f11
22099 +       ;;
22100 +       ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)    // load ar.rnat (may be garbage)
22101 +       ld8 r31=[r3],PT(R1)-PT(PR)              // load predicates
22102 +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
22103 +       ;;
22104 +       ld8 r20=[r2],PT(R12)-PT(AR_FPSR)        // load ar.fpsr
22105 +       ld8.fill r1=[r3],16     // load r1
22106 +(pUStk) mov r17=1
22107 +       ;;
22108 +       srlz.d                  // M0  ensure interruption collection is off
22109 +       ld8.fill r13=[r3],16
22110 +       mov f7=f0               // clear f7
22111 +       ;;
22112 +       ld8.fill r12=[r2]       // restore r12 (sp)
22113 +       ld8.fill r15=[r3]       // restore r15
22114 +       addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
22115 +       ;;
22116 +(pUStk)        ld4 r3=[r3]             // r3 = cpu_data->phys_stacked_size_p8
22117 +(pUStk) st1 [r14]=r17
22118 +       mov b6=r18              // I0  restore b6
22119 +       ;;
22120 +       mov r14=r0              // clear r14
22121 +       shr.u r18=r19,16        // I0|1 get byte size of existing "dirty" partition
22122 +(pKStk) br.cond.dpnt.many skip_rbs_switch
22123 +
22124 +       mov.m ar.ccv=r0         // clear ar.ccv
22125 +(pNonSys) br.cond.dpnt.many dont_preserve_current_frame
22126 +       br.cond.sptk.many rbs_switch
22127 +#ifdef CONFIG_XEN
22128 +END(xen_leave_syscall)
22129 +#else
22130 +END(ia64_leave_syscall)
22131 +#endif
22132 +
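Two idioms recur in the exit paths above and below: a runtime fallback to the native routine when the kernel is not actually running under Xen, and interrupt masking done by storing to the shared page instead of executing rsm psr.i. A hedged sketch (pointer names are stand-ins; running_on_xen is the flag the patch itself tests):

        #include <stdint.h>

        extern int running_on_xen;               /* ld4 of running_on_xen */
        extern void __ia64_leave_syscall(void);  /* native implementation */
        static volatile uint32_t *xsi_psr_i;     /* maps XSI_PSR_I */

        void xen_leave_syscall(void)
        {
                if (!running_on_xen) {           /* cmp.eq p7,p0=r22,r0 */
                        __ia64_leave_syscall();  /* (p7) br.cond.sptk.many ... */
                        return;
                }
                *xsi_psr_i = 0;                  /* virtual "cli": st4 [XSI_PSR_I]=r0 */
                /* ... rest of the paravirtual exit path ... */
        }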
22133 +#ifdef CONFIG_XEN
22134 +GLOBAL_ENTRY(xen_leave_kernel)
22135 +       PT_REGS_UNWIND_INFO(0)
22136 +       movl r22=running_on_xen;;
22137 +       ld4 r22=[r22];;
22138 +       cmp.eq p7,p0=r22,r0
22139 +(p7)   br.cond.sptk.many __ia64_leave_kernel;;
22140 +#else
22141 +GLOBAL_ENTRY(ia64_leave_kernel)
22142 +       PT_REGS_UNWIND_INFO(0)
22143 +#endif
22144 +       /*
22145 +        * work.need_resched etc. mustn't get changed by this CPU before it returns to
22146 +        * user- or fsys-mode, hence we disable interrupts early on.
22147 +        *
22148 +        * p6 controls whether current_thread_info()->flags needs to be checked for
22149 +        * extra work.  We always check for extra work when returning to user-level.
22150 +        * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
22151 +        * is 0.  After extra work processing has been completed, execution
22152 +        * resumes at .work_processed_kernel with p6 set to 1 if the extra-work-check
22153 +        * needs to be redone.
22154 +        */
22155 +#ifdef CONFIG_PREEMPT
22156 +       rsm psr.i                               // disable interrupts
22157 +       cmp.eq p0,pLvSys=r0,r0                  // pLvSys=0: leave from kernel
22158 +(pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
22159 +       ;;
22160 +       .pred.rel.mutex pUStk,pKStk
22161 +(pKStk)        ld4 r21=[r20]                   // r21 <- preempt_count
22162 +(pUStk)        mov r21=0                       // r21 <- 0
22163 +       ;;
22164 +       cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
22165 +#else
22166 +#ifdef CONFIG_XEN
22167 +(pUStk)        movl r17=XSI_PSR_I
22168 +       ;;
22169 +(pUStk)        st4 [r17]=r0
22170 +       ;;
22171 +#else
22172 +(pUStk)        rsm psr.i
22173 +#endif
22174 +       cmp.eq p0,pLvSys=r0,r0          // pLvSys=0: leave from kernel
22175 +(pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
22176 +#endif
22177 +.work_processed_kernel:
22178 +       adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
22179 +       ;;
22180 +(p6)   ld4 r31=[r17]                           // load current_thread_info()->flags
22181 +       adds r21=PT(PR)+16,r12
22182 +       ;;
22183 +
22184 +       lfetch [r21],PT(CR_IPSR)-PT(PR)
22185 +       adds r2=PT(B6)+16,r12
22186 +       adds r3=PT(R16)+16,r12
22187 +       ;;
22188 +       lfetch [r21]
22189 +       ld8 r28=[r2],8          // load b6
22190 +       adds r29=PT(R24)+16,r12
22191 +
22192 +       ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
22193 +       adds r30=PT(AR_CCV)+16,r12
22194 +(p6)   and r19=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
22195 +       ;;
22196 +       ld8.fill r24=[r29]
22197 +       ld8 r15=[r30]           // load ar.ccv
22198 +(p6)   cmp4.ne.unc p6,p0=r19, r0               // any special work pending?
22199 +       ;;
22200 +       ld8 r29=[r2],16         // load b7
22201 +       ld8 r30=[r3],16         // load ar.csd
22202 +(p6)   br.cond.spnt .work_pending
22203 +       ;;
22204 +       ld8 r31=[r2],16         // load ar.ssd
22205 +       ld8.fill r8=[r3],16
22206 +       ;;
22207 +       ld8.fill r9=[r2],16
22208 +       ld8.fill r10=[r3],PT(R17)-PT(R10)
22209 +       ;;
22210 +       ld8.fill r11=[r2],PT(R18)-PT(R11)
22211 +       ld8.fill r17=[r3],16
22212 +       ;;
22213 +       ld8.fill r18=[r2],16
22214 +       ld8.fill r19=[r3],16
22215 +       ;;
22216 +       ld8.fill r20=[r2],16
22217 +       ld8.fill r21=[r3],16
22218 +       mov ar.csd=r30
22219 +       mov ar.ssd=r31
22220 +       ;;
22221 +#ifdef CONFIG_XEN
22222 +       movl r22=XSI_PSR_IC
22223 +       ;;
22224 +       st8 [r22]=r0            // note: clears both vpsr.i and vpsr.ic!
22225 +       ;;
22226 +#else
22227 +       rsm psr.i | psr.ic      // initiate turning off of interrupt and interruption collection
22228 +#endif
22229 +       invala                  // invalidate ALAT
22230 +       ;;
22231 +       ld8.fill r22=[r2],24
22232 +       ld8.fill r23=[r3],24
22233 +       mov b6=r28
22234 +       ;;
22235 +       ld8.fill r25=[r2],16
22236 +       ld8.fill r26=[r3],16
22237 +       mov b7=r29
22238 +       ;;
22239 +       ld8.fill r27=[r2],16
22240 +       ld8.fill r28=[r3],16
22241 +       ;;
22242 +       ld8.fill r29=[r2],16
22243 +       ld8.fill r30=[r3],24
22244 +       ;;
22245 +       ld8.fill r31=[r2],PT(F9)-PT(R31)
22246 +       adds r3=PT(F10)-PT(F6),r3
22247 +       ;;
22248 +       ldf.fill f9=[r2],PT(F6)-PT(F9)
22249 +       ldf.fill f10=[r3],PT(F8)-PT(F10)
22250 +       ;;
22251 +       ldf.fill f6=[r2],PT(F7)-PT(F6)
22252 +       ;;
22253 +       ldf.fill f7=[r2],PT(F11)-PT(F7)
22254 +       ldf.fill f8=[r3],32
22255 +       ;;
22256 +       srlz.i                  // ensure interruption collection is off
22257 +       mov ar.ccv=r15
22258 +       ;;
22259 +       ldf.fill f11=[r2]
22260 +#ifdef CONFIG_XEN
22261 +       ;;
22262 +       // r16-r31 all now hold bank1 values
22263 +       movl r2=XSI_BANK1_R16
22264 +       movl r3=XSI_BANK1_R16+8
22265 +       ;;
22266 +       st8.spill [r2]=r16,16
22267 +       st8.spill [r3]=r17,16
22268 +       ;;
22269 +       st8.spill [r2]=r18,16
22270 +       st8.spill [r3]=r19,16
22271 +       ;;
22272 +       st8.spill [r2]=r20,16
22273 +       st8.spill [r3]=r21,16
22274 +       ;;
22275 +       st8.spill [r2]=r22,16
22276 +       st8.spill [r3]=r23,16
22277 +       ;;
22278 +       st8.spill [r2]=r24,16
22279 +       st8.spill [r3]=r25,16
22280 +       ;;
22281 +       st8.spill [r2]=r26,16
22282 +       st8.spill [r3]=r27,16
22283 +       ;;
22284 +       st8.spill [r2]=r28,16
22285 +       st8.spill [r3]=r29,16
22286 +       ;;
22287 +       st8.spill [r2]=r30,16
22288 +       st8.spill [r3]=r31,16
22289 +       ;;
22290 +       movl r2=XSI_BANKNUM;;
22291 +       st4 [r2]=r0;
22292 +#else
22293 +       bsw.0                   // switch back to bank 0 (no stop bit required beforehand...)
22294 +#endif
22295 +       ;;
22296 +(pUStk)        mov r18=IA64_KR(CURRENT)        // M2 (12 cycle read latency)
22297 +       adds r16=PT(CR_IPSR)+16,r12
22298 +       adds r17=PT(CR_IIP)+16,r12
22299 +
22300 +(pKStk)        mov r22=psr             // M2 read PSR now that interrupts are disabled
22301 +       nop.i 0
22302 +       nop.i 0
22303 +       ;;
22304 +       ld8 r29=[r16],16        // load cr.ipsr
22305 +       ld8 r28=[r17],16        // load cr.iip
22306 +       ;;
22307 +       ld8 r30=[r16],16        // load cr.ifs
22308 +       ld8 r25=[r17],16        // load ar.unat
22309 +       ;;
22310 +       ld8 r26=[r16],16        // load ar.pfs
22311 +       ld8 r27=[r17],16        // load ar.rsc
22312 +       cmp.eq p9,p0=r0,r0      // set p9 to indicate that we should restore cr.ifs
22313 +       ;;
22314 +       ld8 r24=[r16],16        // load ar.rnat (may be garbage)
22315 +       ld8 r23=[r17],16        // load ar.bspstore (may be garbage)
22316 +       ;;
22317 +       ld8 r31=[r16],16        // load predicates
22318 +       ld8 r21=[r17],16        // load b0
22319 +       ;;
22320 +       ld8 r19=[r16],16        // load ar.rsc value for "loadrs"
22321 +       ld8.fill r1=[r17],16    // load r1
22322 +       ;;
22323 +       ld8.fill r12=[r16],16
22324 +       ld8.fill r13=[r17],16
22325 +(pUStk)        adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
22326 +       ;;
22327 +       ld8 r20=[r16],16        // ar.fpsr
22328 +       ld8.fill r15=[r17],16
22329 +       ;;
22330 +       ld8.fill r14=[r16],16
22331 +       ld8.fill r2=[r17]
22332 +(pUStk)        mov r17=1
22333 +       ;;
22334 +       ld8.fill r3=[r16]
22335 +(pUStk)        st1 [r18]=r17           // restore current->thread.on_ustack
22336 +       shr.u r18=r19,16        // get byte size of existing "dirty" partition
22337 +       ;;
22338 +       mov r16=ar.bsp          // get existing backing store pointer
22339 +       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
22340 +       ;;
22341 +       ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
22342 +(pKStk)        br.cond.dpnt skip_rbs_switch
22343 +
22344 +       /*
22345 +        * Restore user backing store.
22346 +        *
22347 +        * NOTE: alloc, loadrs, and cover can't be predicated.
22348 +        */
22349 +(pNonSys) br.cond.dpnt dont_preserve_current_frame
22350 +
22351 +rbs_switch:
22352 +#ifdef CONFIG_XEN
22353 +       XEN_HYPER_COVER;
22354 +#else
22355 +       cover                           // add current frame into dirty partition and set cr.ifs
22356 +#endif
22357 +       ;;
22358 +       mov r19=ar.bsp                  // get new backing store pointer
22359 +       sub r16=r16,r18                 // krbs = old bsp - size of dirty partition
22360 +       cmp.ne p9,p0=r0,r0              // clear p9 to skip restore of cr.ifs
22361 +       ;;
22362 +       sub r19=r19,r16                 // calculate total byte size of dirty partition
22363 +       add r18=64,r18                  // don't force in0-in7 into memory...
22364 +       ;;
22365 +       shl r19=r19,16                  // shift size of dirty partition into loadrs position
22366 +       ;;
22367 +dont_preserve_current_frame:
22368 +       /*
22369 +        * To prevent leaking bits between the kernel and user-space,
22370 +        * we must clear the stacked registers in the "invalid" partition here.
22371 +        * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
22372 +        * 5 registers/cycle on McKinley).
22373 +        */
22374 +#      define pRecurse p6
22375 +#      define pReturn  p7
22376 +#ifdef CONFIG_ITANIUM
22377 +#      define Nregs    10
22378 +#else
22379 +#      define Nregs    14
22380 +#endif
22381 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
22382 +       shr.u loc1=r18,9                // RNaTslots <= floor(dirtySize / (64*8))
22383 +       sub r17=r17,r18                 // r17 = (physStackedSize + 8) - dirtySize
22384 +       ;;
22385 +       mov ar.rsc=r19                  // load ar.rsc to be used for "loadrs"
22386 +       shladd in0=loc1,3,r17
22387 +       mov in1=0
22388 +       ;;
22389 +       TEXT_ALIGN(32)
22390 +rse_clear_invalid:
22391 +#ifdef CONFIG_ITANIUM
22392 +       // cycle 0
22393 + { .mii
22394 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
22395 +       cmp.lt pRecurse,p0=Nregs*8,in0  // if more than Nregs regs left to clear, (re)curse
22396 +       add out0=-Nregs*8,in0
22397 +}{ .mfb
22398 +       add out1=1,in1                  // increment recursion count
22399 +       nop.f 0
22400 +       nop.b 0                         // can't do br.call here because of alloc (WAW on CFM)
22401 +       ;;
22402 +}{ .mfi        // cycle 1
22403 +       mov loc1=0
22404 +       nop.f 0
22405 +       mov loc2=0
22406 +}{ .mib
22407 +       mov loc3=0
22408 +       mov loc4=0
22409 +(pRecurse) br.call.sptk.many b0=rse_clear_invalid
22410 +
22411 +}{ .mfi        // cycle 2
22412 +       mov loc5=0
22413 +       nop.f 0
22414 +       cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
22415 +}{ .mib
22416 +       mov loc6=0
22417 +       mov loc7=0
22418 +(pReturn) br.ret.sptk.many b0
22419 +}
22420 +#else /* !CONFIG_ITANIUM */
22421 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
22422 +       cmp.lt pRecurse,p0=Nregs*8,in0  // if more than Nregs regs left to clear, (re)curse
22423 +       add out0=-Nregs*8,in0
22424 +       add out1=1,in1                  // increment recursion count
22425 +       mov loc1=0
22426 +       mov loc2=0
22427 +       ;;
22428 +       mov loc3=0
22429 +       mov loc4=0
22430 +       mov loc5=0
22431 +       mov loc6=0
22432 +       mov loc7=0
22433 +(pRecurse) br.call.sptk.few b0=rse_clear_invalid
22434 +       ;;
22435 +       mov loc8=0
22436 +       mov loc9=0
22437 +       cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
22438 +       mov loc10=0
22439 +       mov loc11=0
22440 +(pReturn) br.ret.sptk.many b0
22441 +#endif /* !CONFIG_ITANIUM */
22442 +#      undef pRecurse
22443 +#      undef pReturn
22444 +       ;;
22445 +       alloc r17=ar.pfs,0,0,0,0        // drop current register frame
22446 +       ;;
22447 +       loadrs
22448 +       ;;
22449 +skip_rbs_switch:
22450 +       mov ar.unat=r25         // M2
22451 +(pKStk)        extr.u r22=r22,21,1     // I0 extract current value of psr.pp from r22
22452 +(pLvSys)mov r19=r0             // A  clear r19 for leave_syscall, no-op otherwise
22453 +       ;;
22454 +(pUStk)        mov ar.bspstore=r23     // M2
22455 +(pKStk)        dep r29=r22,r29,21,1    // I0 update ipsr.pp with psr.pp
22456 +(pLvSys)mov r16=r0             // A  clear r16 for leave_syscall, no-op otherwise
22457 +       ;;
22458 +#ifdef CONFIG_XEN
22459 +       movl r25=XSI_IPSR
22460 +       ;;
22461 +       st8 [r25]=r29,XSI_IFS-XSI_IPSR
22462 +       ;;
22463 +#else
22464 +       mov cr.ipsr=r29         // M2
22465 +#endif
22466 +       mov ar.pfs=r26          // I0
22467 +(pLvSys)mov r17=r0             // A  clear r17 for leave_syscall, no-op otherwise
22468 +
22469 +#ifdef CONFIG_XEN
22470 +(p9)   st8 [r25]=r30
22471 +       ;;
22472 +       adds r25=XSI_IIP-XSI_IFS,r25
22473 +       ;;
22474 +#else
22475 +(p9)   mov cr.ifs=r30          // M2
22476 +#endif
22477 +       mov b0=r21              // I0
22478 +(pLvSys)mov r18=r0             // A  clear r18 for leave_syscall, no-op otherwise
22479 +
22480 +       mov ar.fpsr=r20         // M2
22481 +#ifdef CONFIG_XEN
22482 +       st8     [r25]=r28
22483 +#else
22484 +       mov cr.iip=r28          // M2
22485 +#endif
22486 +       nop 0
22487 +       ;;
22488 +(pUStk)        mov ar.rnat=r24         // M2 must happen with RSE in lazy mode
22489 +       nop 0
22490 +(pLvSys)mov r2=r0
22491 +
22492 +       mov ar.rsc=r27          // M2
22493 +       mov pr=r31,-1           // I0
22494 +#ifdef CONFIG_XEN
22495 +       ;;
22496 +       XEN_HYPER_RFI;
22497 +#else
22498 +       rfi                     // B
22499 +#endif
22500 +
22501 +       /*
22502 +        * On entry:
22503 +        *      r20 = &current->thread_info->preempt_count (if CONFIG_PREEMPT)
22504 +        *      r31 = current->thread_info->flags
22505 +        * On exit:
22506 +        *      p6 = TRUE if work-pending-check needs to be redone
22507 +        */
22508 +.work_pending_syscall:
22509 +       add r2=-8,r2
22510 +       add r3=-8,r3
22511 +       ;;
22512 +       st8 [r2]=r8
22513 +       st8 [r3]=r10
22514 +.work_pending:
22515 +       tbit.nz p6,p0=r31,TIF_SIGDELAYED                // signal delayed from MCA/INIT/NMI/PMI context?
22516 +(p6)   br.cond.sptk.few .sigdelayed
22517 +       ;;
22518 +       tbit.z p6,p0=r31,TIF_NEED_RESCHED               // current_thread_info()->need_resched==0?
22519 +(p6)   br.cond.sptk.few .notify
22520 +#ifdef CONFIG_PREEMPT
22521 +(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
22522 +       ;;
22523 +(pKStk) st4 [r20]=r21
22524 +       ssm psr.i               // enable interrupts
22525 +#endif
22526 +       br.call.spnt.many rp=schedule
22527 +.ret9: cmp.eq p6,p0=r0,r0                              // p6 <- 1
22528 +#ifdef CONFIG_XEN
22529 +       movl r2=XSI_PSR_I
22530 +       ;;
22531 +       st4 [r2]=r0
22532 +#else
22533 +       rsm psr.i               // disable interrupts
22534 +#endif
22535 +       ;;
22536 +#ifdef CONFIG_PREEMPT
22537 +(pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
22538 +       ;;
22539 +(pKStk)        st4 [r20]=r0            // preempt_count() <- 0
22540 +#endif
22541 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
22542 +       br.cond.sptk.many .work_processed_kernel        // re-check
22543 +
22544 +.notify:
22545 +(pUStk)        br.call.spnt.many rp=notify_resume_user
22546 +.ret10:        cmp.ne p6,p0=r0,r0                              // p6 <- 0
22547 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
22548 +       br.cond.sptk.many .work_processed_kernel        // don't re-check
22549 +
22550 +// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
22551 +// it could not be delivered.  Deliver it now.  The signal might be for us and
22552 +// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
22553 +// signal.
22554 +
22555 +.sigdelayed:
22556 +       br.call.sptk.many rp=do_sigdelayed
22557 +       cmp.eq p6,p0=r0,r0                              // p6 <- 1, always re-check
22558 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
22559 +       br.cond.sptk.many .work_processed_kernel        // re-check
22560 +
22561 +.work_pending_syscall_end:
22562 +       adds r2=PT(R8)+16,r12
22563 +       adds r3=PT(R10)+16,r12
22564 +       ;;
22565 +       ld8 r8=[r2]
22566 +       ld8 r10=[r3]
22567 +       br.cond.sptk.many .work_processed_syscall       // re-check
22568 +
22569 +#ifdef CONFIG_XEN
22570 +END(xen_leave_kernel)
22571 +#else
22572 +END(ia64_leave_kernel)
22573 +#endif
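The rse_clear_invalid loop above scrubs the invalid RSE partition by recursion: every "call" allocates a fresh frame whose Nregs locals are renamed onto not-yet-cleared stacked registers, zeroing Nregs of them per iteration, and pReturn unwinds the chain using the recursion count. A hedged C model of that control flow:

        #define NREGS 14        /* 10 on Itanium, 14 otherwise, per the #ifdef */

        static void rse_clear_invalid(long bytes_left, long depth)
        {
                /* the freshly alloc'd locals are zeroed here (mov loc1..locN=0) */
                if (bytes_left > NREGS * 8)             /* cmp.lt pRecurse,p0=Nregs*8,in0 */
                        rse_clear_invalid(bytes_left - NREGS * 8, depth + 1);
                if (depth != 0)                         /* cmp.ne pReturn,p0=r0,in1 */
                        return;                         /* (pReturn) br.ret b0 */
                /* depth == 0: the outermost level falls through to loadrs */
        }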
22574 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/xenhpski.c tmp-linux-2.6-xen.patch/arch/ia64/xen/xenhpski.c
22575 --- ref-linux-2.6.16.9/arch/ia64/xen/xenhpski.c 1970-01-01 01:00:00.000000000 +0100
22576 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/xenhpski.c    2006-04-10 00:05:52.000000000 +0200
22577 @@ -0,0 +1,19 @@
22578 +
22579 +extern unsigned long xen_get_cpuid(int);
22580 +
22581 +int
22582 +running_on_sim(void)
22583 +{
22584 +       int i;
22585 +       long cpuid[6];
22586 +
22587 +       for (i = 0; i < 5; ++i)
22588 +               cpuid[i] = xen_get_cpuid(i);
22589 +       if ((cpuid[0] & 0xff) != 'H') return 0;
22590 +       if ((cpuid[3] & 0xff) != 0x4) return 0;
22591 +       if (((cpuid[3] >> 8) & 0xff) != 0x0) return 0;
22592 +       if (((cpuid[3] >> 16) & 0xff) != 0x0) return 0;
22593 +       if (((cpuid[3] >> 24) & 0x7) != 0x7) return 0;
22594 +       return 1;
22595 +}
22596 +
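running_on_sim() matches the HP Ski simulator's CPUID signature: the first vendor byte in cpuid[0] must be 'H', and cpuid[3] must report number 4, revision 0, model 0, and (in its low three bits) family 7. A hedged decode of the cpuid[3] fields those checks correspond to (IA-64 CPUID register 3: bits 0-7 number, 8-15 revision, 16-23 model, 24-31 family):

        #include <stdint.h>

        struct cpuid3 {                 /* illustrative unpacking */
                unsigned number, revision, model, family;
        };

        static struct cpuid3 unpack_cpuid3(uint64_t v)
        {
                struct cpuid3 c = {
                        .number   = v & 0xff,           /* checked against 0x4 */
                        .revision = (v >> 8)  & 0xff,   /* checked against 0x0 */
                        .model    = (v >> 16) & 0xff,   /* checked against 0x0 */
                        .family   = (v >> 24) & 0xff,   /* low 3 bits checked against 0x7 */
                };
                return c;
        }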
22597 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/xenivt.S tmp-linux-2.6-xen.patch/arch/ia64/xen/xenivt.S
22598 --- ref-linux-2.6.16.9/arch/ia64/xen/xenivt.S   1970-01-01 01:00:00.000000000 +0100
22599 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/xenivt.S      2006-04-10 00:05:52.000000000 +0200
22600 @@ -0,0 +1,2032 @@
22601 +/*
22602 + * arch/ia64/xen/ivt.S
22603 + *
22604 + * Copyright (C) 2005 Hewlett-Packard Co
22605 + *     Dan Magenheimer <dan.magenheimer@hp.com>
22606 + */
22607 +/*
22608 + * This file defines the interruption vector table used by the CPU.
22609 + * It does not include one entry per possible cause of interruption.
22610 + *
22611 + * The first 20 entries of the table contain 64 bundles each while the
22612 + * remaining 48 entries contain only 16 bundles each.
22613 + *
22614 + * The 64 bundles are used to allow inlining the whole handler for critical
22615 + * interruptions like TLB misses.
22616 + *
22617 + *  For each entry, the comment is as follows:
22618 + *
22619 + *             // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
22620 + *  entry offset ----/     /         /                  /          /
22621 + *  entry number ---------/         /                  /          /
22622 + *  size of the entry -------------/                  /          /
22623 + *  vector name -------------------------------------/          /
22624 + *  interruptions triggering this vector ----------------------/
22625 + *
22626 + * The table is 32KB in size and must be aligned on a 32KB boundary.
22627 + * (The CPU ignores the 15 lower bits of the address)
22628 + *
22629 + * Table is based upon EAS2.6 (Oct 1999)
22630 + */
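The geometry described above can be checked with a little arithmetic: an IA-64 bundle is 16 bytes, so the big entries are 0x400 bytes apart (matching the .org steps below) and the whole table is exactly 32KB. The same sums as compile-time statements:

        #define BUNDLE_BYTES 16

        _Static_assert(64 * BUNDLE_BYTES == 0x400,
                       "a 64-bundle entry spans 0x400 bytes, cf. the .org directives");
        _Static_assert(20 * 64 * BUNDLE_BYTES + 48 * 16 * BUNDLE_BYTES == 32768,
                       "20 big + 48 small entries fill the 32KB table exactly");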
22631 +
22632 +#include <linux/config.h>
22633 +
22634 +#include <asm/asmmacro.h>
22635 +#include <asm/break.h>
22636 +#include <asm/ia32.h>
22637 +#include <asm/kregs.h>
22638 +#include <asm/asm-offsets.h>
22639 +#include <asm/pgtable.h>
22640 +#include <asm/processor.h>
22641 +#include <asm/ptrace.h>
22642 +#include <asm/system.h>
22643 +#include <asm/thread_info.h>
22644 +#include <asm/unistd.h>
22645 +#include <asm/errno.h>
22646 +
22647 +#ifdef CONFIG_XEN
22648 +#define ia64_ivt xen_ivt
22649 +#endif
22650 +
22651 +#if 1
22652 +# define PSR_DEFAULT_BITS      psr.ac
22653 +#else
22654 +# define PSR_DEFAULT_BITS      0
22655 +#endif
22656 +
22657 +#if 0
22658 +  /*
22659 +   * This lets you track the last eight faults that occurred on the CPU.  Make sure ar.k2 isn't
22660 +   * needed for something else before enabling this...
22661 +   */
22662 +# define DBG_FAULT(i)  mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16
22663 +#else
22664 +# define DBG_FAULT(i)
22665 +#endif
22666 +
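When enabled, DBG_FAULT treats ar.k2 as an eight-slot shift register of fault numbers, one byte per fault, newest in the low byte. The equivalent logic in C:

        #include <stdint.h>

        static uint64_t ar_k2;          /* stands in for the ar.k2 kernel register */

        static void dbg_fault(uint64_t vector)
        {
                ar_k2 = (ar_k2 << 8) + vector;  /* shl r16=r16,8; add r16=(i),r16 */
        }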
22667 +#define MINSTATE_VIRT  /* needed by minstate.h */
22668 +#include "xenminstate.h"
22669 +
22670 +#define FAULT(n)                                                                       \
22671 +       mov r31=pr;                                                                     \
22672 +       mov r19=n;;                     /* prepare to save predicates */                \
22673 +       br.sptk.many dispatch_to_fault_handler
22674 +
22675 +       .section .text.ivt,"ax"
22676 +
22677 +       .align 32768    // align on 32KB boundary
22678 +       .global ia64_ivt
22679 +ia64_ivt:
22680 +/////////////////////////////////////////////////////////////////////////////////////////
22681 +// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
22682 +ENTRY(vhpt_miss)
22683 +       DBG_FAULT(0)
22684 +       /*
22685 +        * The VHPT vector is invoked when the TLB entry for the virtual page table
22686 +        * is missing.  This happens only as a result of a previous
22687 +        * (the "original") TLB miss, which may either be caused by an instruction
22688 +        * fetch or a data access (or non-access).
22689 +        *
22690 +        * What we do here is normal TLB miss handling for the _original_ miss, followed
22691 +        * by inserting the TLB entry for the virtual page table page that the VHPT
22692 +        * walker was attempting to access.  The latter gets inserted as long
22693 +        * as both L1 and L2 have valid mappings for the faulting address.
22694 +        * The TLB entry for the original miss gets inserted only if
22695 +        * the L3 entry indicates that the page is present.
22696 +        *
22697 +        * do_page_fault gets invoked in the following cases:
22698 +        *      - the faulting virtual address uses unimplemented address bits
22699 +        *      - the faulting virtual address has no L1, L2, or L3 mapping
22700 +        */
22701 +#ifdef CONFIG_XEN
22702 +       movl r16=XSI_IFA
22703 +       ;;
22704 +       ld8 r16=[r16]
22705 +#ifdef CONFIG_HUGETLB_PAGE
22706 +       movl r18=PAGE_SHIFT
22707 +       movl r25=XSI_ITIR
22708 +       ;;
22709 +       ld8 r25=[r25]
22710 +#endif
22711 +       ;;
22712 +#else
22713 +       mov r16=cr.ifa                          // get address that caused the TLB miss
22714 +#ifdef CONFIG_HUGETLB_PAGE
22715 +       movl r18=PAGE_SHIFT
22716 +       mov r25=cr.itir
22717 +#endif
22718 +#endif
22719 +       ;;
22720 +#ifdef CONFIG_XEN
22721 +       XEN_HYPER_RSM_PSR_DT;
22722 +#else
22723 +       rsm psr.dt                              // use physical addressing for data
22724 +#endif
22725 +       mov r31=pr                              // save the predicate registers
22726 +       mov r19=IA64_KR(PT_BASE)                // get page table base address
22727 +       shl r21=r16,3                           // shift bit 60 into sign bit
22728 +       shr.u r17=r16,61                        // get the region number into r17
22729 +       ;;
22730 +       shr r22=r21,3
22731 +#ifdef CONFIG_HUGETLB_PAGE
22732 +       extr.u r26=r25,2,6
22733 +       ;;
22734 +       cmp.ne p8,p0=r18,r26
22735 +       sub r27=r26,r18
22736 +       ;;
22737 +(p8)   dep r25=r18,r25,2,6
22738 +(p8)   shr r22=r22,r27
22739 +#endif
22740 +       ;;
22741 +       cmp.eq p6,p7=5,r17                      // is IFA pointing into region 5?
22742 +       shr.u r18=r22,PGDIR_SHIFT               // get bits 33-63 of the faulting address
22743 +       ;;
22744 +(p7)   dep r17=r17,r19,(PAGE_SHIFT-3),3        // put region number bits in place
22745 +
22746 +       srlz.d
22747 +       LOAD_PHYSICAL(p6, r19, swapper_pg_dir)  // region 5 is rooted at swapper_pg_dir
22748 +
22749 +       .pred.rel "mutex", p6, p7
22750 +(p6)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
22751 +(p7)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
22752 +       ;;
22753 +(p6)   dep r17=r18,r19,3,(PAGE_SHIFT-3)        // r17=PTA + IFA(33,42)*8
22754 +(p7)   dep r17=r18,r17,3,(PAGE_SHIFT-6)        // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
22755 +       cmp.eq p7,p6=0,r21                      // unused address bits all zeroes?
22756 +       shr.u r18=r22,PMD_SHIFT                 // shift L2 index into position
22757 +       ;;
22758 +       ld8 r17=[r17]                           // fetch the L1 entry (may be 0)
22759 +       ;;
22760 +(p7)   cmp.eq p6,p7=r17,r0                     // was L1 entry NULL?
22761 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // compute address of L2 page table entry
22762 +       ;;
22763 +(p7)   ld8 r20=[r17]                           // fetch the L2 entry (may be 0)
22764 +       shr.u r19=r22,PAGE_SHIFT                // shift L3 index into position
22765 +       ;;
22766 +(p7)   cmp.eq.or.andcm p6,p7=r20,r0            // was L2 entry NULL?
22767 +       dep r21=r19,r20,3,(PAGE_SHIFT-3)        // compute address of L3 page table entry
22768 +       ;;
22769 +#ifdef CONFIG_XEN
22770 +(p7)   ld8 r18=[r21]                           // read the L3 PTE
22771 +       movl r19=XSI_ISR
22772 +       ;;
22773 +       ld8 r19=[r19]
22774 +       ;;
22775 +(p7)   tbit.z p6,p7=r18,_PAGE_P_BIT            // page present bit cleared?
22776 +       movl r22=XSI_IHA
22777 +       ;;
22778 +       ld8 r22=[r22]
22779 +       ;;
22780 +#else
22781 +(p7)   ld8 r18=[r21]                           // read the L3 PTE
22782 +       mov r19=cr.isr                          // cr.isr bit 32 tells us if this is an insn miss
22783 +       ;;
22784 +(p7)   tbit.z p6,p7=r18,_PAGE_P_BIT            // page present bit cleared?
22785 +       mov r22=cr.iha                          // get the VHPT address that caused the TLB miss
22786 +       ;;                                      // avoid RAW on p7
22787 +#endif
22788 +(p7)   tbit.nz.unc p10,p11=r19,32              // is it an instruction TLB miss?
22789 +       dep r23=0,r20,0,PAGE_SHIFT              // clear low bits to get page address
22790 +       ;;
22791 +#ifdef CONFIG_XEN
22792 +       mov r24=r8
22793 +       mov r8=r18
22794 +       ;;
22795 +(p10)  XEN_HYPER_ITC_D
22796 +       ;;
22797 +(p11)  XEN_HYPER_ITC_I
22798 +       ;;
22799 +       mov r8=r24
22800 +       ;;
22801 +(p6)   br.cond.spnt.many page_fault            // handle bad address/page not present (page fault)
22802 +       ;;
22803 +       movl r24=XSI_IFA
22804 +       ;;
22805 +       st8 [r24]=r22
22806 +       ;;
22807 +#else
22808 +(p10)  itc.i r18                               // insert the instruction TLB entry
22809 +(p11)  itc.d r18                               // insert the data TLB entry
22810 +(p6)   br.cond.spnt.many page_fault            // handle bad address/page not present (page fault)
22811 +       mov cr.ifa=r22
22812 +#endif
22813 +
22814 +#ifdef CONFIG_HUGETLB_PAGE
22815 +(p8)   mov cr.itir=r25                         // change to default page-size for VHPT
22816 +#endif
22817 +
22818 +       /*
22819 +        * Now compute and insert the TLB entry for the virtual page table.  We never
22820 +        * execute in a page table page so there is no need to set the exception deferral
22821 +        * bit.
22822 +        */
22823 +       adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
22824 +       ;;
22825 +#ifdef CONFIG_XEN
22826 +(p7)   mov r25=r8
22827 +(p7)   mov r8=r24
22828 +       ;;
22829 +(p7)   XEN_HYPER_ITC_D
22830 +       ;;
22831 +(p7)   mov r8=r25
22832 +       ;;
22833 +#else
22834 +(p7)   itc.d r24
22835 +#endif
22836 +       ;;
22837 +#ifdef CONFIG_SMP
22838 +       /*
22839 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
22840 +        * cannot possibly affect the following loads:
22841 +        */
22842 +       dv_serialize_data
22843 +
22844 +       /*
22845 +        * Re-check L2 and L3 pagetable.  If they changed, we may have received a ptc.g
22846 +        * between reading the pagetable and the "itc".  If so, flush the entry we
22847 +        * inserted and retry.
22848 +        */
22849 +       ld8 r25=[r21]                           // read L3 PTE again
22850 +       ld8 r26=[r17]                           // read L2 entry again
22851 +       ;;
22852 +       cmp.ne p6,p7=r26,r20                    // did L2 entry change
22853 +       mov r27=PAGE_SHIFT<<2
22854 +       ;;
22855 +(p6)   ptc.l r22,r27                           // purge PTE page translation
22856 +(p7)   cmp.ne.or.andcm p6,p7=r25,r18           // did L3 PTE change
22857 +       ;;
22858 +(p6)   ptc.l r16,r27                           // purge translation
22859 +#endif
22860 +
22861 +       mov pr=r31,-1                           // restore predicate registers
22862 +#ifdef CONFIG_XEN
22863 +       XEN_HYPER_RFI;
22864 +#else
22865 +       rfi
22866 +#endif
22867 +END(vhpt_miss)
22868 +
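vhpt_miss above walks the three-level page table in physical mode. A hedged C outline of that walk, keeping the L1/L2/L3 naming from the comments; the shifts are the 8KB-page values implied by the "bits 33-63" remark, and the types are illustrative rather than the kernel's pgd/pmd/pte API:

        #include <stdint.h>
        #include <stddef.h>

        #define PAGE_SHIFT  13                  /* 8KB pages, illustrative */
        #define PMD_SHIFT   (PAGE_SHIFT + 10)   /* 1024 eight-byte entries per level */
        #define PGDIR_SHIFT (PMD_SHIFT + 10)    /* = 33, cf. "bits 33-63" above */
        #define PTRS        1024

        static uint64_t *walk(uint64_t *l1, uint64_t va)
        {
                uint64_t *l2, *l3;

                l2 = (uint64_t *)(uintptr_t)l1[(va >> PGDIR_SHIFT) % PTRS];
                if (!l2)                        /* L1 entry was 0 -> page_fault */
                        return NULL;
                l3 = (uint64_t *)(uintptr_t)l2[(va >> PMD_SHIFT) % PTRS];
                if (!l3)                        /* L2 entry was 0 -> page_fault */
                        return NULL;
                return &l3[(va >> PAGE_SHIFT) % PTRS];  /* address of the L3 PTE */
        }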
22869 +       .org ia64_ivt+0x400
22870 +/////////////////////////////////////////////////////////////////////////////////////////
22871 +// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
22872 +ENTRY(itlb_miss)
22873 +       DBG_FAULT(1)
22874 +       /*
22875 +        * The ITLB handler accesses the L3 PTE via the virtually mapped linear
22876 +        * page table.  If a nested TLB miss occurs, we switch into physical
22877 +        * mode, walk the page table, and then re-execute the L3 PTE read
22878 +        * and go on normally after that.
22879 +        */
22880 +#ifdef CONFIG_XEN
22881 +       movl r16=XSI_IFA
22882 +       ;;
22883 +       ld8 r16=[r16]
22884 +#else
22885 +       mov r16=cr.ifa                          // get virtual address
22886 +#endif
22887 +       mov r29=b0                              // save b0
22888 +       mov r31=pr                              // save predicates
22889 +.itlb_fault:
22890 +#ifdef CONFIG_XEN
22891 +       movl r17=XSI_IHA
22892 +       ;;
22893 +       ld8 r17=[r17]                           // get virtual address of L3 PTE
22894 +#else
22895 +       mov r17=cr.iha                          // get virtual address of L3 PTE
22896 +#endif
22897 +       movl r30=1f                             // load nested fault continuation point
22898 +       ;;
22899 +1:     ld8 r18=[r17]                           // read L3 PTE
22900 +       ;;
22901 +       mov b0=r29
22902 +       tbit.z p6,p0=r18,_PAGE_P_BIT            // page present bit cleared?
22903 +(p6)   br.cond.spnt page_fault
22904 +       ;;
22905 +#ifdef CONFIG_XEN
22906 +       mov r19=r8
22907 +       mov r8=r18
22908 +       ;;
22909 +       XEN_HYPER_ITC_I
22910 +       ;;
22911 +       mov r8=r19
22912 +#else
22913 +       itc.i r18
22914 +#endif
22915 +       ;;
22916 +#ifdef CONFIG_SMP
22917 +       /*
22918 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
22919 +        * cannot possibly affect the following loads:
22920 +        */
22921 +       dv_serialize_data
22922 +
22923 +       ld8 r19=[r17]                           // read L3 PTE again and see if same
22924 +       mov r20=PAGE_SHIFT<<2                   // setup page size for purge
22925 +       ;;
22926 +       cmp.ne p7,p0=r18,r19
22927 +       ;;
22928 +(p7)   ptc.l r16,r20
22929 +#endif
22930 +       mov pr=r31,-1
22931 +#ifdef CONFIG_XEN
22932 +       XEN_HYPER_RFI;
22933 +#else
22934 +       rfi
22935 +#endif
22936 +END(itlb_miss)
22937 +
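itlb_miss (and dtlb_miss below) use the same SMP race mitigation: insert the translation, then re-read the PTE; if a remote CPU's ptc.g changed it in between, purge the entry just inserted so the access refaults and retries. A hedged sketch, with stub routines standing in for the instructions:

        #include <stdint.h>

        static void itc_insert(uint64_t pte) { (void)pte; }     /* models itc.i/itc.d */
        static void ptc_local(uint64_t va, uint64_t ps) { (void)va; (void)ps; } /* models ptc.l */

        static void install_and_recheck(volatile uint64_t *ptep, uint64_t va)
        {
                uint64_t pte = *ptep;           /* 1: ld8 r18=[r17] */
                itc_insert(pte);                /* itc.i / itc.d r18 */
                if (*ptep != pte)               /* ld8 r19=[r17]; cmp.ne p7,p0 */
                        ptc_local(va, 13UL << 2);  /* (p7) ptc.l r16,r20 (PAGE_SHIFT<<2) */
        }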
22938 +       .org ia64_ivt+0x0800
22939 +/////////////////////////////////////////////////////////////////////////////////////////
22940 +// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
22941 +ENTRY(dtlb_miss)
22942 +       DBG_FAULT(2)
22943 +       /*
22944 +        * The DTLB handler accesses the L3 PTE via the virtually mapped linear
22945 +        * page table.  If a nested TLB miss occurs, we switch into physical
22946 +        * mode, walk the page table, and then re-execute the L3 PTE read
22947 +        * and go on normally after that.
22948 +        */
22949 +#ifdef CONFIG_XEN
22950 +       movl r16=XSI_IFA
22951 +       ;;
22952 +       ld8 r16=[r16]
22953 +#else
22954 +       mov r16=cr.ifa                          // get virtual address
22955 +#endif
22956 +       mov r29=b0                              // save b0
22957 +       mov r31=pr                              // save predicates
22958 +dtlb_fault:
22959 +#ifdef CONFIG_XEN
22960 +       movl r17=XSI_IHA
22961 +       ;;
22962 +       ld8 r17=[r17]                           // get virtual address of L3 PTE
22963 +#else
22964 +       mov r17=cr.iha                          // get virtual address of L3 PTE
22965 +#endif
22966 +       movl r30=1f                             // load nested fault continuation point
22967 +       ;;
22968 +1:     ld8 r18=[r17]                           // read L3 PTE
22969 +       ;;
22970 +       mov b0=r29
22971 +       tbit.z p6,p0=r18,_PAGE_P_BIT            // page present bit cleared?
22972 +(p6)   br.cond.spnt page_fault
22973 +       ;;
22974 +#ifdef CONFIG_XEN
22975 +       mov r19=r8
22976 +       mov r8=r18
22977 +       ;;
22978 +       XEN_HYPER_ITC_D
22979 +       ;;
22980 +       mov r8=r19
22981 +       ;;
22982 +#else
22983 +       itc.d r18
22984 +#endif
22985 +       ;;
22986 +#ifdef CONFIG_SMP
22987 +       /*
22988 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
22989 +        * cannot possibly affect the following loads:
22990 +        */
22991 +       dv_serialize_data
22992 +
22993 +       ld8 r19=[r17]                           // read L3 PTE again and see if same
22994 +       mov r20=PAGE_SHIFT<<2                   // setup page size for purge
22995 +       ;;
22996 +       cmp.ne p7,p0=r18,r19
22997 +       ;;
22998 +(p7)   ptc.l r16,r20
22999 +#endif
23000 +       mov pr=r31,-1
23001 +#ifdef CONFIG_XEN
23002 +       XEN_HYPER_RFI;
23003 +#else
23004 +       rfi
23005 +#endif
23006 +END(dtlb_miss)
23007 +
23008 +       .org ia64_ivt+0x0c00
23009 +/////////////////////////////////////////////////////////////////////////////////////////
23010 +// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
23011 +ENTRY(alt_itlb_miss)
23012 +       DBG_FAULT(3)
23013 +#ifdef CONFIG_XEN
23014 +       movl r31=XSI_IPSR
23015 +       ;;
23016 +       ld8 r21=[r31],XSI_IFA-XSI_IPSR  // get ipsr, point to ifa
23017 +       movl r17=PAGE_KERNEL
23018 +       ;;
23019 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
23020 +       ;;
23021 +       ld8 r16=[r31]           // get ifa
23022 +       mov r31=pr
23023 +       ;;
23024 +#else
23025 +       mov r16=cr.ifa          // get address that caused the TLB miss
23026 +       movl r17=PAGE_KERNEL
23027 +       mov r21=cr.ipsr
23028 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
23029 +       mov r31=pr
23030 +       ;;
23031 +#endif
23032 +#ifdef CONFIG_DISABLE_VHPT
23033 +       shr.u r22=r16,61                        // get the region number into r22
23034 +       ;;
23035 +       cmp.gt p8,p0=6,r22                      // user mode
23036 +       ;;
23037 +#ifndef CONFIG_XEN
23038 +(p8)   thash r17=r16
23039 +       ;;
23040 +(p8)   mov cr.iha=r17
23041 +#endif
23042 +(p8)   mov r29=b0                              // save b0
23043 +(p8)   br.cond.dptk .itlb_fault
23044 +#endif
23045 +       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
23046 +       and r19=r19,r16         // clear ed, reserved bits, and PTE control bits
23047 +       shr.u r18=r16,57        // move address bit 61 to bit 4
23048 +       ;;
23049 +       andcm r18=0x10,r18      // bit 4=~address-bit(61)
23050 +       cmp.ne p8,p0=r0,r23     // psr.cpl != 0?
23051 +       or r19=r17,r19          // insert PTE control bits into r19
23052 +       ;;
23053 +       or r19=r19,r18          // set bit 4 (uncached) if the access was to region 6
23054 +(p8)   br.cond.spnt page_fault
23055 +       ;;
23056 +#ifdef CONFIG_XEN
23057 +       mov r18=r8
23058 +       mov r8=r19
23059 +       ;;
23060 +       XEN_HYPER_ITC_I
23061 +       ;;
23062 +       mov r8=r18
23063 +       ;;
23064 +       mov pr=r31,-1
23065 +       ;;
23066 +       XEN_HYPER_RFI;
23067 +#else
23068 +       itc.i r19               // insert the TLB entry
23069 +       mov pr=r31,-1
23070 +       rfi
23071 +#endif
23072 +END(alt_itlb_miss)
23073 +
23074 +       .org ia64_ivt+0x1000
23075 +/////////////////////////////////////////////////////////////////////////////////////////
23076 +// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
23077 +ENTRY(alt_dtlb_miss)
23078 +       DBG_FAULT(4)
23079 +#ifdef CONFIG_XEN
23080 +       movl r31=XSI_IPSR
23081 +       ;;
23082 +       ld8 r21=[r31],XSI_ISR-XSI_IPSR  // get ipsr, point to isr
23083 +       movl r17=PAGE_KERNEL
23084 +       ;;
23085 +       ld8 r20=[r31],XSI_IFA-XSI_ISR   // get isr, point to ifa
23086 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
23087 +       ;;
23088 +       ld8 r16=[r31]           // get ifa
23089 +       mov r31=pr
23090 +       ;;
23091 +#else
23092 +       mov r16=cr.ifa          // get address that caused the TLB miss
23093 +       movl r17=PAGE_KERNEL
23094 +       mov r20=cr.isr
23095 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
23096 +       mov r21=cr.ipsr
23097 +       mov r31=pr
23098 +       ;;
23099 +#endif
23100 +#ifdef CONFIG_DISABLE_VHPT
23101 +       shr.u r22=r16,61                        // get the region number into r22
23102 +       ;;
23103 +       cmp.gt p8,p0=6,r22                      // access to region 0-5
23104 +       ;;
23105 +#ifndef CONFIG_XEN
23106 +(p8)   thash r17=r16
23107 +       ;;
23108 +(p8)   mov cr.iha=r17
23109 +#endif
23110 +(p8)   mov r29=b0                              // save b0
23111 +(p8)   br.cond.dptk dtlb_fault
23112 +#endif
23113 +       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
23114 +       and r22=IA64_ISR_CODE_MASK,r20          // get the isr.code field
23115 +       tbit.nz p6,p7=r20,IA64_ISR_SP_BIT       // is speculation bit on?
23116 +       shr.u r18=r16,57                        // move address bit 61 to bit 4
23117 +       and r19=r19,r16                         // clear ed, reserved bits, and PTE control bits
23118 +       tbit.nz p9,p0=r20,IA64_ISR_NA_BIT       // is non-access bit on?
23119 +       ;;
23120 +       andcm r18=0x10,r18      // bit 4=~address-bit(61)
23121 +       cmp.ne p8,p0=r0,r23
23122 +(p9)   cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22  // check isr.code field
23123 +(p8)   br.cond.spnt page_fault
23124 +
23125 +       dep r21=-1,r21,IA64_PSR_ED_BIT,1
23126 +       or r19=r19,r17          // insert PTE control bits into r19
23127 +       ;;
23128 +       or r19=r19,r18          // set bit 4 (uncached) if the access was to region 6
23129 +(p6)   mov cr.ipsr=r21
23130 +       ;;
23131 +#ifdef CONFIG_XEN
23132 +(p7)   mov r18=r8
23133 +(p7)   mov r8=r19
23134 +       ;;
23135 +(p7)   XEN_HYPER_ITC_D
23136 +       ;;
23137 +(p7)   mov r8=r18
23138 +       ;;
23139 +       mov pr=r31,-1
23140 +       ;;
23141 +       XEN_HYPER_RFI;
23142 +#else
23143 +(p7)   itc.d r19               // insert the TLB entry
23144 +       mov pr=r31,-1
23145 +       rfi
23146 +#endif
23147 +END(alt_dtlb_miss)
23148 +
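For the identity-mapped kernel regions, alt_itlb_miss/alt_dtlb_miss synthesize the PTE directly from the faulting address: PAGE_KERNEL supplies the control bits, and bit 4 (the uncacheable memory attribute) is set when bit 61 of the address is clear, i.e. for region 6. A hedged sketch with illustrative mask values:

        #include <stdint.h>

        #define PAGE_KERNEL_BITS  0x7b1UL            /* illustrative control bits */
        #define PHYS_MASK         ((1UL << 50) - 1)  /* models IA64_MAX_PHYS_BITS */

        static uint64_t alt_miss_pte(uint64_t ifa)
        {
                uint64_t pte = (ifa & PHYS_MASK & ~0xfffUL) | PAGE_KERNEL_BITS;
                if (!(ifa & (1UL << 61)))       /* region 6 (0b110): bit 61 clear */
                        pte |= 0x10;            /* "set bit 4 (uncached)" */
                return pte;
        }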
23149 +       .org ia64_ivt+0x1400
23150 +/////////////////////////////////////////////////////////////////////////////////////////
23151 +// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
23152 +ENTRY(nested_dtlb_miss)
23153 +       /*
23154 +        * In the absence of kernel bugs, we get here when the virtually mapped linear
23155 +        * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
23156 +        * Access-bit, or Data Access-bit faults).  If the DTLB entry for the virtual page
23157 +        * table is missing, a nested TLB miss fault is triggered and control is
23158 +        * transferred to this point.  When this happens, we look up the pte for the
23159 +        * faulting address by walking the page table in physical mode and return to the
23160 +        * continuation point passed in register r30 (or call page_fault if the address is
23161 +        * not mapped).
23162 +        *
23163 +        * Input:       r16:    faulting address
23164 +        *              r29:    saved b0
23165 +        *              r30:    continuation address
23166 +        *              r31:    saved pr
23167 +        *
23168 +        * Output:      r17:    physical address of L3 PTE of faulting address
23169 +        *              r29:    saved b0
23170 +        *              r30:    continuation address
23171 +        *              r31:    saved pr
23172 +        *
23173 +        * Clobbered:   b0, r18, r19, r21, psr.dt (cleared)
23174 +        */
23175 +#ifdef CONFIG_XEN
23176 +       XEN_HYPER_RSM_PSR_DT;
23177 +#else
23178 +       rsm psr.dt                              // switch to using physical data addressing
23179 +#endif
23180 +       mov r19=IA64_KR(PT_BASE)                // get the page table base address
23181 +       shl r21=r16,3                           // shift bit 60 into sign bit
23182 +       ;;
23183 +       shr.u r17=r16,61                        // get the region number into r17
23184 +       ;;
23185 +       cmp.eq p6,p7=5,r17                      // is faulting address in region 5?
23186 +       shr.u r18=r16,PGDIR_SHIFT               // get bits 33-63 of faulting address
23187 +       ;;
23188 +(p7)   dep r17=r17,r19,(PAGE_SHIFT-3),3        // put region number bits in place
23189 +
23190 +       srlz.d
23191 +       LOAD_PHYSICAL(p6, r19, swapper_pg_dir)  // region 5 is rooted at swapper_pg_dir
23192 +
23193 +       .pred.rel "mutex", p6, p7
23194 +(p6)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
23195 +(p7)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
23196 +       ;;
23197 +(p6)   dep r17=r18,r19,3,(PAGE_SHIFT-3)        // r17=PTA + IFA(33,42)*8
23198 +(p7)   dep r17=r18,r17,3,(PAGE_SHIFT-6)        // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
23199 +       cmp.eq p7,p6=0,r21                      // unused address bits all zeroes?
23200 +       shr.u r18=r16,PMD_SHIFT                 // shift L2 index into position
23201 +       ;;
23202 +       ld8 r17=[r17]                           // fetch the L1 entry (may be 0)
23203 +       ;;
23204 +(p7)   cmp.eq p6,p7=r17,r0                     // was L1 entry NULL?
23205 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // compute address of L2 page table entry
23206 +       ;;
23207 +(p7)   ld8 r17=[r17]                           // fetch the L2 entry (may be 0)
23208 +       shr.u r19=r16,PAGE_SHIFT                // shift L3 index into position
23209 +       ;;
23210 +(p7)   cmp.eq.or.andcm p6,p7=r17,r0            // was L2 entry NULL?
23211 +       dep r17=r19,r17,3,(PAGE_SHIFT-3)        // compute address of L3 page table entry
23212 +(p6)   br.cond.spnt page_fault
23213 +       mov b0=r30
23214 +       br.sptk.many b0                         // return to continuation point
23215 +END(nested_dtlb_miss)
23216 +
23217 +       .org ia64_ivt+0x1800
23218 +/////////////////////////////////////////////////////////////////////////////////////////
23219 +// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
23220 +ENTRY(ikey_miss)
23221 +       DBG_FAULT(6)
23222 +       FAULT(6)
23223 +END(ikey_miss)
23224 +
23225 +       //-----------------------------------------------------------------------------------
23226 +       // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
23227 +ENTRY(page_fault)
23228 +#ifdef CONFIG_XEN
23229 +       XEN_HYPER_SSM_PSR_DT;
23230 +#else
23231 +       ssm psr.dt
23232 +       ;;
23233 +       srlz.i
23234 +#endif
23235 +       ;;
23236 +       SAVE_MIN_WITH_COVER
23237 +       alloc r15=ar.pfs,0,0,3,0
23238 +#ifdef CONFIG_XEN
23239 +       movl r3=XSI_ISR
23240 +       ;;
23241 +       ld8 out1=[r3],XSI_IFA-XSI_ISR           // get vcr.isr, point to ifa
23242 +       ;;
23243 +       ld8 out0=[r3]                           // get vcr.ifa
23244 +       mov r14=1
23245 +       ;;
23246 +       add r3=XSI_PSR_IC-XSI_IFA, r3           // point to vpsr.ic
23247 +       ;;
23248 +       st4 [r3]=r14                            // vpsr.ic = 1
23249 +       adds r3=8,r2                            // set up second base pointer
23250 +       ;;
23251 +#else
23252 +       mov out0=cr.ifa
23253 +       mov out1=cr.isr
23254 +       adds r3=8,r2                            // set up second base pointer
23255 +       ;;
23256 +       ssm psr.ic | PSR_DEFAULT_BITS
23257 +       ;;
23258 +       srlz.i                                  // guarantee that interruption collection is on
23259 +       ;;
23260 +#endif
23261 +#ifdef CONFIG_XEN
23262 +       br.cond.sptk.many       xen_page_fault
23263 +       ;;
23264 +done_xen_page_fault:
23265 +#endif
23266 +(p15)  ssm psr.i                               // restore psr.i
23267 +       movl r14=ia64_leave_kernel
23268 +       ;;
23269 +       SAVE_REST
23270 +       mov rp=r14
23271 +       ;;
23272 +       adds out2=16,r12                        // out2 = pointer to pt_regs
23273 +       br.call.sptk.many b6=ia64_do_page_fault // ignore return address
23274 +END(page_fault)
23275 +
23276 +       .org ia64_ivt+0x1c00
23277 +/////////////////////////////////////////////////////////////////////////////////////////
23278 +// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
23279 +ENTRY(dkey_miss)
23280 +       DBG_FAULT(7)
23281 +       FAULT(7)
23282 +#ifdef CONFIG_XEN
23283 +       // Leaving this code inline above results in an IVT section overflow
23284 +       // There is no particular reason for this code to be here...
23285 +xen_page_fault:
23286 +(p15)  movl r3=XSI_PSR_I
23287 +       ;;
23288 +(p15)  st4 [r3]=r14,XSI_PEND-XSI_PSR_I         // if (p15) vpsr.i = 1
23289 +       mov r14=r0
23290 +       ;;
23291 +(p15)  ld4 r14=[r3]                            // if (pending_interrupts)
23292 +       adds r3=8,r2                            // re-set up second base pointer
23293 +       ;;
23294 +(p15)  cmp.ne  p15,p0=r14,r0
23295 +       ;;
23296 +       br.cond.sptk.many done_xen_page_fault
23297 +       ;;
23298 +#endif
23299 +END(dkey_miss)
23300 +
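IVT entries have fixed capacity, so the Xen-only epilogue that no longer fit in the page_fault slot is parked in the otherwise nearly empty dkey_miss entry. Its job, rendered as hedged C (pointer names are stand-ins for the XSI_* offsets): if the interrupted context had interrupts enabled (p15), re-enable the virtual psr.i in the shared page, and keep p15 set only if an event is already pending, so that the caller's (p15) ssm psr.i delivers it.

        #include <stdint.h>

        static volatile uint32_t *xsi_psr_i;    /* XSI_PSR_I in the shared page */
        static volatile uint32_t *xsi_pend;     /* XSI_PEND, pending-event flag */

        static int xen_page_fault_prologue(int p15)
        {
                if (p15) {
                        *xsi_psr_i = 1;         /* (p15) st4 [r3]=r14  (r14==1) */
                        return *xsi_pend != 0;  /* (p15) ld4 r14=[r3]; cmp.ne  */
                }
                return 0;                       /* p15 stays clear */
        }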
23301 +       .org ia64_ivt+0x2000
23302 +/////////////////////////////////////////////////////////////////////////////////////////
23303 +// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
23304 +ENTRY(dirty_bit)
23305 +       DBG_FAULT(8)
23306 +       /*
23307 +        * What we do here is to simply turn on the dirty bit in the PTE.  We need to
23308 +        * update both the page-table and the TLB entry.  To efficiently access the PTE,
23309 +        * we address it through the virtual page table.  Most likely, the TLB entry for
23310 +        * the relevant virtual page table page is still present in the TLB so we can
23311 +        * normally do this without additional TLB misses.  In case the necessary virtual
23312 +        * page table TLB entry isn't present, we take a nested TLB miss, where we look
23313 +        * up the physical address of the L3 PTE and then continue at label 1 below.
23314 +        */
23315 +#ifdef CONFIG_XEN
23316 +       movl r16=XSI_IFA
23317 +       ;;
23318 +       ld8 r16=[r16]
23319 +       ;;
23320 +#else
23321 +       mov r16=cr.ifa                          // get the address that caused the fault
23322 +#endif
23323 +       movl r30=1f                             // load continuation point in case of nested fault
23324 +       ;;
23325 +#ifdef CONFIG_XEN
23326 +       mov r18=r8;
23327 +       mov r8=r16;
23328 +       XEN_HYPER_THASH;;
23329 +       mov r17=r8;
23330 +       mov r8=r18;;
23331 +#else
23332 +       thash r17=r16                           // compute virtual address of L3 PTE
23333 +#endif
23334 +       mov r29=b0                              // save b0 in case of nested fault
23335 +       mov r31=pr                              // save pr
23336 +#ifdef CONFIG_SMP
23337 +       mov r28=ar.ccv                          // save ar.ccv
23338 +       ;;
23339 +1:     ld8 r18=[r17]
23340 +       ;;                                      // avoid RAW on r18
23341 +       mov ar.ccv=r18                          // set compare value for cmpxchg
23342 +       or r25=_PAGE_D|_PAGE_A,r18              // set the dirty and accessed bits
23343 +       ;;
23344 +       cmpxchg8.acq r26=[r17],r25,ar.ccv
23345 +       mov r24=PAGE_SHIFT<<2
23346 +       ;;
23347 +       cmp.eq p6,p7=r26,r18
23348 +       ;;
23349 +(p6)   itc.d r25                               // install updated PTE
23350 +       ;;
23351 +       /*
23352 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
23353 +        * cannot possibly affect the following loads:
23354 +        */
23355 +       dv_serialize_data
23356 +
23357 +       ld8 r18=[r17]                           // read PTE again
23358 +       ;;
23359 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
23360 +       ;;
23361 +(p7)   ptc.l r16,r24
23362 +       mov b0=r29                              // restore b0
23363 +       mov ar.ccv=r28
23364 +#else
23365 +       ;;
23366 +1:     ld8 r18=[r17]
23367 +       ;;                                      // avoid RAW on r18
23368 +       or r18=_PAGE_D|_PAGE_A,r18              // set the dirty and accessed bits
23369 +       mov b0=r29                              // restore b0
23370 +       ;;
23371 +       st8 [r17]=r18                           // store back updated PTE
23372 +       itc.d r18                               // install updated PTE
23373 +#endif
23374 +       mov pr=r31,-1                           // restore pr
23375 +#ifdef CONFIG_XEN
23376 +       XEN_HYPER_RFI;
23377 +#else
23378 +       rfi
23379 +#endif
23380 +END(dirty_bit)
23381 +
23382 +       .org ia64_ivt+0x2400
23383 +/////////////////////////////////////////////////////////////////////////////////////////
23384 +// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
23385 +ENTRY(iaccess_bit)
23386 +       DBG_FAULT(9)
23387 +       // Like Entry 8, except for instruction access
23388 +#ifdef CONFIG_XEN
23389 +       movl r16=XSI_IFA
23390 +       ;;
23391 +       ld8 r16=[r16]
23392 +       ;;
23393 +#else
23394 +       mov r16=cr.ifa                          // get the address that caused the fault
23395 +#endif
23396 +       movl r30=1f                             // load continuation point in case of nested fault
23397 +       mov r31=pr                              // save predicates
23398 +#ifdef CONFIG_ITANIUM
23399 +       /*
23400 +        * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
23401 +        */
23402 +       mov r17=cr.ipsr
23403 +       ;;
23404 +       mov r18=cr.iip
23405 +       tbit.z p6,p0=r17,IA64_PSR_IS_BIT        // IA64 instruction set?
23406 +       ;;
23407 +(p6)   mov r16=r18                             // if so, use cr.iip instead of cr.ifa
23408 +#endif /* CONFIG_ITANIUM */
23409 +       ;;
23410 +#ifdef CONFIG_XEN
23411 +       mov r18=r8;
23412 +       mov r8=r16;
23413 +       XEN_HYPER_THASH;;
23414 +       mov r17=r8;
23415 +       mov r8=r18;;
23416 +#else
23417 +       thash r17=r16                           // compute virtual address of L3 PTE
23418 +#endif
23419 +       mov r29=b0                              // save b0 in case of nested fault
23420 +#ifdef CONFIG_SMP
23421 +       mov r28=ar.ccv                          // save ar.ccv
23422 +       ;;
23423 +1:     ld8 r18=[r17]
23424 +       ;;
23425 +       mov ar.ccv=r18                          // set compare value for cmpxchg
23426 +       or r25=_PAGE_A,r18                      // set the accessed bit
23427 +       ;;
23428 +       cmpxchg8.acq r26=[r17],r25,ar.ccv
23429 +       mov r24=PAGE_SHIFT<<2
23430 +       ;;
23431 +       cmp.eq p6,p7=r26,r18
23432 +       ;;
23433 +#ifdef CONFIG_XEN
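23433 +       // XEN_HYPER_ITC_I expects the new PTE in r8, so stash the live r8 in
23433 +       // r26 around the hypercall (daccess_bit mirrors this for the data TLB).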
23434 +       mov r26=r8
23435 +       mov r8=r25
23436 +       ;;
23437 +(p6)   XEN_HYPER_ITC_I
23438 +       ;;
23439 +       mov r8=r26
23440 +       ;;
23441 +#else
23442 +(p6)   itc.i r25                               // install updated PTE
23443 +#endif
23444 +       ;;
23445 +       /*
23446 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
23447 +        * cannot possibly affect the following loads:
23448 +        */
23449 +       dv_serialize_data
23450 +
23451 +       ld8 r18=[r17]                           // read PTE again
23452 +       ;;
23453 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
23454 +       ;;
23455 +(p7)   ptc.l r16,r24
23456 +       mov b0=r29                              // restore b0
23457 +       mov ar.ccv=r28
23458 +#else /* !CONFIG_SMP */
23459 +       ;;
23460 +1:     ld8 r18=[r17]
23461 +       ;;
23462 +       or r18=_PAGE_A,r18                      // set the accessed bit
23463 +       mov b0=r29                              // restore b0
23464 +       ;;
23465 +       st8 [r17]=r18                           // store back updated PTE
23466 +       itc.i r18                               // install updated PTE
23467 +#endif /* !CONFIG_SMP */
23468 +       mov pr=r31,-1
23469 +#ifdef CONFIG_XEN
23470 +       XEN_HYPER_RFI;
23471 +#else
23472 +       rfi
23473 +#endif
23474 +END(iaccess_bit)
23475 +
23476 +       .org ia64_ivt+0x2800
23477 +/////////////////////////////////////////////////////////////////////////////////////////
23478 +// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
23479 +ENTRY(daccess_bit)
23480 +       DBG_FAULT(10)
23481 +       // Like Entry 8, except for data access
23482 +#ifdef CONFIG_XEN
23483 +       movl r16=XSI_IFA
23484 +       ;;
23485 +       ld8 r16=[r16]
23486 +       ;;
23487 +#else
23488 +       mov r16=cr.ifa                          // get the address that caused the fault
23489 +#endif
23490 +       movl r30=1f                             // load continuation point in case of nested fault
23491 +       ;;
23492 +#ifdef CONFIG_XEN
23493 +       mov r18=r8;
23494 +       mov r8=r16;
23495 +       XEN_HYPER_THASH;;
23496 +       mov r17=r8;
23497 +       mov r8=r18;;
23498 +#else
23499 +       thash r17=r16                           // compute virtual address of L3 PTE
23500 +#endif
23501 +       mov r31=pr
23502 +       mov r29=b0                              // save b0 in case of nested fault
23503 +#ifdef CONFIG_SMP
23504 +       mov r28=ar.ccv                          // save ar.ccv
23505 +       ;;
23506 +1:     ld8 r18=[r17]
23507 +       ;;                                      // avoid RAW on r18
23508 +       mov ar.ccv=r18                          // set compare value for cmpxchg
23509 +       or r25=_PAGE_A,r18                      // set the accessed bit
23510 +       ;;
23511 +       cmpxchg8.acq r26=[r17],r25,ar.ccv
23512 +       mov r24=PAGE_SHIFT<<2
23513 +       ;;
23514 +       cmp.eq p6,p7=r26,r18
23515 +       ;;
23516 +#ifdef CONFIG_XEN
23517 +       mov r26=r8
23518 +       mov r8=r25
23519 +       ;;
23520 +(p6)   XEN_HYPER_ITC_D
23521 +       ;;
23522 +       mov r8=r26
23523 +       ;;
23524 +#else
23525 +(p6)   itc.d r25                               // install updated PTE
23526 +#endif
23527 +       /*
23528 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
23529 +        * cannot possibly affect the following loads:
23530 +        */
23531 +       dv_serialize_data
23532 +       ;;
23533 +       ld8 r18=[r17]                           // read PTE again
23534 +       ;;
23535 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
23536 +       ;;
23537 +(p7)   ptc.l r16,r24
23538 +       mov ar.ccv=r28
23539 +#else
23540 +       ;;
23541 +1:     ld8 r18=[r17]
23542 +       ;;                                      // avoid RAW on r18
23543 +       or r18=_PAGE_A,r18                      // set the accessed bit
23544 +       ;;
23545 +       st8 [r17]=r18                           // store back updated PTE
23546 +       itc.d r18                               // install updated PTE
23547 +#endif
23548 +       mov b0=r29                              // restore b0
23549 +       mov pr=r31,-1
23550 +#ifdef CONFIG_XEN
23551 +       XEN_HYPER_RFI;
23552 +#else
23553 +       rfi
23554 +#endif
23555 +END(daccess_bit)
23556 +
23557 +       .org ia64_ivt+0x2c00
23558 +/////////////////////////////////////////////////////////////////////////////////////////
23559 +// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
23560 +ENTRY(break_fault)
23561 +       /*
23562 +        * The streamlined system call entry/exit paths only save/restore the initial part
23563 +        * of pt_regs.  This implies that the callers of system-calls must adhere to the
23564 +        * normal procedure calling conventions.
23565 +        *
23566 +        *   Registers to be saved & restored:
23567 +        *      CR registers: cr.ipsr, cr.iip, cr.ifs
23568 +        *      AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
23569 +        *      others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
23570 +        *   Registers to be restored only:
23571 +        *      r8-r11: output value from the system call.
23572 +        *
23573 +        * During system call exit, scratch registers (including r15) are modified/cleared
23574 +        * to prevent leaking bits from kernel to user level.
23575 +        */
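23576 +       // Under Xen the interruption state (ipsr, iip, iim) is read from the
23576 +       // shared state area rather than from cr registers; the loads are
23576 +       // interleaved with the ar reads to hide their latency.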
23576 +       DBG_FAULT(11)
23577 +       mov r16=IA64_KR(CURRENT)                // r16 = current task; 12 cycle read lat.
23578 +#ifdef CONFIG_XEN
23579 +       movl r31=XSI_IPSR
23580 +       ;;
23581 +       ld8 r29=[r31],XSI_IIP-XSI_IPSR          // get ipsr, point to iip
23582 +       mov r18=__IA64_BREAK_SYSCALL
23583 +       mov r21=ar.fpsr
23584 +       ;;
23585 +       ld8 r28=[r31],XSI_IIM-XSI_IIP           // get iip, point to iim
23586 +       mov r19=b6
23587 +       mov r25=ar.unat
23588 +       ;;
23589 +       ld8 r17=[r31]                           // get iim
23590 +       mov r27=ar.rsc
23591 +       mov r26=ar.pfs
23592 +       ;;
23593 +#else
23594 +       mov r17=cr.iim
23595 +       mov r18=__IA64_BREAK_SYSCALL
23596 +       mov r21=ar.fpsr
23597 +       mov r29=cr.ipsr
23598 +       mov r19=b6
23599 +       mov r25=ar.unat
23600 +       mov r27=ar.rsc
23601 +       mov r26=ar.pfs
23602 +       mov r28=cr.iip
23603 +#endif
23604 +       mov r31=pr                              // prepare to save predicates
23605 +       mov r20=r1
23606 +       ;;
23607 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
23608 +       cmp.eq p0,p7=r18,r17                    // is this a system call? (p7 <- false, if so)
23609 +(p7)   br.cond.spnt non_syscall
23610 +       ;;
23611 +       ld1 r17=[r16]                           // load current->thread.on_ustack flag
23612 +       st1 [r16]=r0                            // clear current->thread.on_ustack flag
23613 +       add r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16   // set r1 for MINSTATE_START_SAVE_MIN_VIRT
23614 +       ;;
23615 +       invala
23616 +
23617 +       /* adjust return address so we skip over the break instruction: */
23618 +
23619 +       extr.u r8=r29,41,2                      // extract ei field from cr.ipsr
23620 +       ;;
23621 +       cmp.eq p6,p7=2,r8                       // ipsr.ei==2?
23622 +       mov r2=r1                               // setup r2 for ia64_syscall_setup
23623 +       ;;
23624 +(p6)   mov r8=0                                // clear ei to 0
23625 +(p6)   adds r28=16,r28                         // switch cr.iip to next bundle cr.ipsr.ei wrapped
23626 +(p7)   adds r8=1,r8                            // increment ei to next slot
23627 +       ;;
23628 +       cmp.eq pKStk,pUStk=r0,r17               // are we in kernel mode already?
23629 +       dep r29=r8,r29,41,2                     // insert new ei into cr.ipsr
23630 +       ;;
23631 +
23632 +       // switch from user to kernel RBS:
23633 +       MINSTATE_START_SAVE_MIN_VIRT
23634 +       br.call.sptk.many b7=ia64_syscall_setup
23635 +       ;;
23636 +#ifdef CONFIG_XEN
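23637 +       // bsw.1 cannot be used directly under Xen; xen_bsw1 (placed in the
23637 +       // spare space of the 0x7e00 slot below) reloads the bank 1 registers
23637 +       // from the shared state area instead.  b0 is preserved in r2 across
23637 +       // the call.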
23637 +       mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;;
23638 +#else
23639 +       MINSTATE_END_SAVE_MIN_VIRT              // switch to bank 1
23640 +#endif
23641 +#ifdef CONFIG_XEN
23642 +       movl r3=XSI_PSR_IC
23643 +       mov r16=1
23644 +       ;;
23645 +#if 1
23646 +       st4 [r3]=r16,XSI_PSR_I-XSI_PSR_IC       // vpsr.ic = 1
23647 +       ;;
23648 +(p15)  st4 [r3]=r16,XSI_PEND-XSI_PSR_I         // if (p15) vpsr.i = 1
23649 +       mov r16=r0
23650 +       ;;
23651 +(p15)  ld4 r16=[r3]                            // if (pending_interrupts)
23652 +       ;;
23653 +       cmp.ne  p6,p0=r16,r0
23654 +       ;;
23655 +(p6)   ssm     psr.i                           //   do a real ssm psr.i
23656 +       ;;
23657 +#else
23658 +//     st4 [r3]=r16,XSI_PSR_I-XSI_PSR_IC       // vpsr.ic = 1
23659 +       adds r3=XSI_PSR_I-XSI_PSR_IC,r3         // SKIP vpsr.ic = 1
23660 +       ;;
23661 +(p15)  st4 [r3]=r16,XSI_PEND-XSI_PSR_I         // if (p15) vpsr.i = 1
23662 +       mov r16=r0
23663 +       ;;
23664 +(p15)  ld4 r16=[r3]                            // if (pending_interrupts)
23665 +       ;;
23666 +       cmp.ne  p6,p0=r16,r0
23667 +       ;;
23668 +//(p6) ssm     psr.i                           //   do a real ssm psr.i
23669 +//(p6) XEN_HYPER_SSM_I;
23670 +(p6)   break 0x7;
23671 +       ;;
23672 +#endif
23673 +       mov r3=NR_syscalls - 1
23674 +       ;;
23675 +#else
23676 +       ssm psr.ic | PSR_DEFAULT_BITS
23677 +       ;;
23678 +       srlz.i                                  // guarantee that interruption collection is on
23679 +       mov r3=NR_syscalls - 1
23680 +       ;;
23681 +(p15)  ssm psr.i                               // restore psr.i
23682 +#endif
23683 +       // p10==true means more than 8 out registers are in use or r15's NaT is set
23684 +(p10)  br.cond.spnt.many ia64_ret_from_syscall
23685 +       ;;
23686 +       movl r16=sys_call_table
23687 +
23688 +       adds r15=-1024,r15                      // r15 contains the syscall number---subtract 1024
23689 +       movl r2=ia64_ret_from_syscall
23690 +       ;;
23691 +       shladd r20=r15,3,r16                    // r20 = sys_call_table + 8*(syscall-1024)
23692 +       cmp.leu p6,p7=r15,r3                    // (syscall > 0 && syscall < 1024 + NR_syscalls) ?
23693 +       mov rp=r2                               // set the real return addr
23694 +       ;;
23695 +(p6)   ld8 r20=[r20]                           // load address of syscall entry point
23696 +(p7)   movl r20=sys_ni_syscall
23697 +
23698 +       add r2=TI_FLAGS+IA64_TASK_SIZE,r13
23699 +       ;;
23700 +       ld4 r2=[r2]                             // r2 = current_thread_info()->flags
23701 +       ;;
23702 +       and r2=_TIF_SYSCALL_TRACEAUDIT,r2       // mask trace or audit
23703 +       ;;
23704 +       cmp.eq p8,p0=r2,r0
23705 +       mov b6=r20
23706 +       ;;
23707 +(p8)   br.call.sptk.many b6=b6                 // ignore this return addr
23708 +       br.cond.sptk ia64_trace_syscall
23709 +       // NOT REACHED
23710 +END(break_fault)
23711 +
23712 +       .org ia64_ivt+0x3000
23713 +/////////////////////////////////////////////////////////////////////////////////////////
23714 +// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
23715 +ENTRY(interrupt)
23716 +       DBG_FAULT(12)
23717 +       mov r31=pr              // prepare to save predicates
23718 +       ;;
23719 +       SAVE_MIN_WITH_COVER     // uses r31; defines r2 and r3
23720 +#ifdef CONFIG_XEN
23721 +       movl r3=XSI_PSR_IC
23722 +       mov r14=1
23723 +       ;;
23724 +       st4 [r3]=r14
23725 +#else
23726 +       ssm psr.ic | PSR_DEFAULT_BITS
23727 +#endif
23728 +       ;;
23729 +       adds r3=8,r2            // set up second base pointer for SAVE_REST
23730 +       srlz.i                  // ensure everybody knows psr.ic is back on
23731 +       ;;
23732 +       SAVE_REST
23733 +       ;;
23734 +       alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
23735 +#ifdef CONFIG_XEN
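23736 +       // cr.ivr is not readable by the guest; xen_get_ivr obtains the
23736 +       // pending vector from the hypervisor and returns it in r8.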
23736 +       ;;
23737 +       br.call.sptk.many rp=xen_get_ivr
23738 +       ;;
23739 +       mov out0=r8             // pass cr.ivr as first arg
23740 +#else
23741 +       mov out0=cr.ivr         // pass cr.ivr as first arg
23742 +#endif
23743 +       add out1=16,sp          // pass pointer to pt_regs as second arg
23744 +       ;;
23745 +       srlz.d                  // make sure we see the effect of cr.ivr
23746 +       movl r14=ia64_leave_kernel
23747 +       ;;
23748 +       mov rp=r14
23749 +       br.call.sptk.many b6=ia64_handle_irq
23750 +END(interrupt)
23751 +
23752 +       .org ia64_ivt+0x3400
23753 +/////////////////////////////////////////////////////////////////////////////////////////
23754 +// 0x3400 Entry 13 (size 64 bundles) Reserved
23755 +       DBG_FAULT(13)
23756 +       FAULT(13)
23757 +
23758 +       .org ia64_ivt+0x3800
23759 +/////////////////////////////////////////////////////////////////////////////////////////
23760 +// 0x3800 Entry 14 (size 64 bundles) Reserved
23761 +       DBG_FAULT(14)
23762 +       FAULT(14)
23763 +
23764 +       /*
23765 +        * There is no particular reason for this code to be here, other than that
23766 +        * there happens to be space here that would go unused otherwise.  If this
23767 +        * fault ever gets "unreserved", simply move the following code to a more
23768 +        * suitable spot...
23769 +        *
23770 +        * ia64_syscall_setup() is a separate subroutine so that it can
23771 +        *      allocate stacked registers and thereby safely demine any
23772 +        *      potential NaT values from the input registers.
23773 +        *
23774 +        * On entry:
23775 +        *      - executing on bank 0 or bank 1 register set (doesn't matter)
23776 +        *      -  r1: stack pointer
23777 +        *      -  r2: current task pointer
23778 +        *      -  r3: preserved
23779 +        *      - r11: original contents (saved ar.pfs to be saved)
23780 +        *      - r12: original contents (sp to be saved)
23781 +        *      - r13: original contents (tp to be saved)
23782 +        *      - r15: original contents (syscall # to be saved)
23783 +        *      - r18: saved bsp (after switching to kernel stack)
23784 +        *      - r19: saved b6
23785 +        *      - r20: saved r1 (gp)
23786 +        *      - r21: saved ar.fpsr
23787 +        *      - r22: kernel's register backing store base (krbs_base)
23788 +        *      - r23: saved ar.bspstore
23789 +        *      - r24: saved ar.rnat
23790 +        *      - r25: saved ar.unat
23791 +        *      - r26: saved ar.pfs
23792 +        *      - r27: saved ar.rsc
23793 +        *      - r28: saved cr.iip
23794 +        *      - r29: saved cr.ipsr
23795 +        *      - r31: saved pr
23796 +        *      -  b0: original contents (to be saved)
23797 +        * On exit:
23798 +        *      - executing on bank 1 registers
23799 +        *      - psr.ic enabled, interrupts restored
23800 +        *      -  p10: TRUE if syscall is invoked with more than 8 out
23801 +        *              registers or r15's NaT is true
23802 +        *      -  r1: kernel's gp
23803 +        *      -  r3: preserved (same as on entry)
23804 +        *      -  r8: -EINVAL if p10 is true
23805 +        *      - r12: points to kernel stack
23806 +        *      - r13: points to current task
23807 +        *      - p15: TRUE if interrupts need to be re-enabled
23808 +        *      - ar.fpsr: set to kernel settings
23809 +        */
23810 +#ifndef CONFIG_XEN
23811 +GLOBAL_ENTRY(ia64_syscall_setup)
23812 +#if PT(B6) != 0
23813 +# error This code assumes that b6 is the first field in pt_regs.
23814 +#endif
23815 +       st8 [r1]=r19                            // save b6
23816 +       add r16=PT(CR_IPSR),r1                  // initialize first base pointer
23817 +       add r17=PT(R11),r1                      // initialize second base pointer
23818 +       ;;
23819 +       alloc r19=ar.pfs,8,0,0,0                // ensure in0-in7 are writable
23820 +       st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR)    // save cr.ipsr
23821 +       tnat.nz p8,p0=in0
23822 +
23823 +       st8.spill [r17]=r11,PT(CR_IIP)-PT(R11)  // save r11
23824 +       tnat.nz p9,p0=in1
23825 +(pKStk)        mov r18=r0                              // make sure r18 isn't NaT
23826 +       ;;
23827 +
23828 +       st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS)     // save ar.pfs
23829 +       st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP)    // save cr.iip
23830 +       mov r28=b0                              // save b0 (2 cyc)
23831 +       ;;
23832 +
23833 +       st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT)    // save ar.unat
23834 +       dep r19=0,r19,38,26                     // clear all bits but 0..37 [I0]
23835 +(p8)   mov in0=-1
23836 +       ;;
23837 +
23838 +       st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS)    // store ar.pfs.pfm in cr.ifs
23839 +       extr.u r11=r19,7,7      // I0           // get sol of ar.pfs
23840 +       and r8=0x7f,r19         // A            // get sof of ar.pfs
23841 +
23842 +       st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
23843 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
23844 +(p9)   mov in1=-1
23845 +       ;;
23846 +
23847 +(pUStk) sub r18=r18,r22                                // r18=RSE.ndirty*8
23848 +       tnat.nz p10,p0=in2
23849 +       add r11=8,r11
23850 +       ;;
23851 +(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16                // skip over ar_rnat field
23852 +(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17    // skip over ar_bspstore field
23853 +       tnat.nz p11,p0=in3
23854 +       ;;
23855 +(p10)  mov in2=-1
23856 +       tnat.nz p12,p0=in4                              // [I0]
23857 +(p11)  mov in3=-1
23858 +       ;;
23859 +(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT)       // save ar.rnat
23860 +(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE)   // save ar.bspstore
23861 +       shl r18=r18,16                          // compute ar.rsc to be used for "loadrs"
23862 +       ;;
23863 +       st8 [r16]=r31,PT(LOADRS)-PT(PR)         // save predicates
23864 +       st8 [r17]=r28,PT(R1)-PT(B0)             // save b0
23865 +       tnat.nz p13,p0=in5                              // [I0]
23866 +       ;;
23867 +       st8 [r16]=r18,PT(R12)-PT(LOADRS)        // save ar.rsc value for "loadrs"
23868 +       st8.spill [r17]=r20,PT(R13)-PT(R1)      // save original r1
23869 +(p12)  mov in4=-1
23870 +       ;;
23871 +
23872 +.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12)       // save r12
23873 +.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13)           // save r13
23874 +(p13)  mov in5=-1
23875 +       ;;
23876 +       st8 [r16]=r21,PT(R8)-PT(AR_FPSR)        // save ar.fpsr
23877 +       tnat.nz p14,p0=in6
23878 +       cmp.lt p10,p9=r11,r8    // frame size can't be more than local+8
23879 +       ;;
23880 +       stf8 [r16]=f1           // ensure pt_regs.r8 != 0 (see handle_syscall_error)
23881 +(p9)   tnat.nz p10,p0=r15
23882 +       adds r12=-16,r1         // switch to kernel memory stack (with 16 bytes of scratch)
23883 +
23884 +       st8.spill [r17]=r15                     // save r15
23885 +       tnat.nz p8,p0=in7
23886 +       nop.i 0
23887 +
23888 +       mov r13=r2                              // establish `current'
23889 +       movl r1=__gp                            // establish kernel global pointer
23890 +       ;;
23891 +(p14)  mov in6=-1
23892 +(p8)   mov in7=-1
23893 +       nop.i 0
23894 +
23895 +       cmp.eq pSys,pNonSys=r0,r0               // set pSys=1, pNonSys=0
23896 +       movl r17=FPSR_DEFAULT
23897 +       ;;
23898 +       mov.m ar.fpsr=r17                       // set ar.fpsr to kernel default value
23899 +(p10)  mov r8=-EINVAL
23900 +       br.ret.sptk.many b7
23901 +END(ia64_syscall_setup)
23902 +#endif
23903 +
23904 +       .org ia64_ivt+0x3c00
23905 +/////////////////////////////////////////////////////////////////////////////////////////
23906 +// 0x3c00 Entry 15 (size 64 bundles) Reserved
23907 +       DBG_FAULT(15)
23908 +       FAULT(15)
23909 +
23910 +       /*
23911 +        * Squatting in this space ...
23912 +        *
23913 +        * This special case dispatcher for illegal operation faults allows preserved
23914 +        * registers to be modified through a callback function (asm only) that is handed
23915 +        * back from the fault handler in r8. Up to three arguments can be passed to the
23916 +        * callback function by returning an aggregate with the callback as its first
23917 +        * element, followed by the arguments.
23918 +        */
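23919 +       // ia64_illegal_op_fault returns the aggregate in r8-r11: r8 holds the
23919 +       // callback (or 0 for none) and r9-r11 become its three arguments.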
23919 +ENTRY(dispatch_illegal_op_fault)
23920 +       SAVE_MIN_WITH_COVER
23921 +       ssm psr.ic | PSR_DEFAULT_BITS
23922 +       ;;
23923 +       srlz.i          // guarantee that interruption collection is on
23924 +       ;;
23925 +(p15)  ssm psr.i       // restore psr.i
23926 +       adds r3=8,r2    // set up second base pointer for SAVE_REST
23927 +       ;;
23928 +       alloc r14=ar.pfs,0,0,1,0        // must be first in insn group
23929 +       mov out0=ar.ec
23930 +       ;;
23931 +       SAVE_REST
23932 +       ;;
23933 +       br.call.sptk.many rp=ia64_illegal_op_fault
23934 +.ret0: ;;
23935 +       alloc r14=ar.pfs,0,0,3,0        // must be first in insn group
23936 +       mov out0=r9
23937 +       mov out1=r10
23938 +       mov out2=r11
23939 +       movl r15=ia64_leave_kernel
23940 +       ;;
23941 +       mov rp=r15
23942 +       mov b6=r8
23943 +       ;;
23944 +       cmp.ne p6,p0=0,r8
23945 +(p6)   br.call.dpnt.many b6=b6         // call returns to ia64_leave_kernel
23946 +       br.sptk.many ia64_leave_kernel
23947 +END(dispatch_illegal_op_fault)
23948 +
23949 +       .org ia64_ivt+0x4000
23950 +/////////////////////////////////////////////////////////////////////////////////////////
23951 +// 0x4000 Entry 16 (size 64 bundles) Reserved
23952 +       DBG_FAULT(16)
23953 +       FAULT(16)
23954 +
23955 +       .org ia64_ivt+0x4400
23956 +/////////////////////////////////////////////////////////////////////////////////////////
23957 +// 0x4400 Entry 17 (size 64 bundles) Reserved
23958 +       DBG_FAULT(17)
23959 +       FAULT(17)
23960 +
23961 +ENTRY(non_syscall)
23962 +       SAVE_MIN_WITH_COVER
23963 +
23964 +       // There is no particular reason for this code to be here, other than that
23965 +       // there happens to be space here that would go unused otherwise.  If this
23966 +       // fault ever gets "unreserved", simply move the following code to a more
23967 +       // suitable spot...
23968 +
23969 +       alloc r14=ar.pfs,0,0,2,0
23970 +       mov out0=cr.iim
23971 +       add out1=16,sp
23972 +       adds r3=8,r2                    // set up second base pointer for SAVE_REST
23973 +
23974 +       ssm psr.ic | PSR_DEFAULT_BITS
23975 +       ;;
23976 +       srlz.i                          // guarantee that interruption collection is on
23977 +       ;;
23978 +(p15)  ssm psr.i                       // restore psr.i
23979 +       movl r15=ia64_leave_kernel
23980 +       ;;
23981 +       SAVE_REST
23982 +       mov rp=r15
23983 +       ;;
23984 +       br.call.sptk.many b6=ia64_bad_break     // avoid WAW on CFM and ignore return addr
23985 +END(non_syscall)
23986 +
23987 +       .org ia64_ivt+0x4800
23988 +/////////////////////////////////////////////////////////////////////////////////////////
23989 +// 0x4800 Entry 18 (size 64 bundles) Reserved
23990 +       DBG_FAULT(18)
23991 +       FAULT(18)
23992 +
23993 +       /*
23994 +        * There is no particular reason for this code to be here, other than that
23995 +        * there happens to be space here that would go unused otherwise.  If this
23996 +        * fault ever gets "unreserved", simply move the following code to a more
23997 +        * suitable spot...
23998 +        */
23999 +
24000 +ENTRY(dispatch_unaligned_handler)
24001 +       SAVE_MIN_WITH_COVER
24002 +       ;;
24003 +       alloc r14=ar.pfs,0,0,2,0                // now it's safe (must be first in insn group!)
24004 +       mov out0=cr.ifa
24005 +       adds out1=16,sp
24006 +
24007 +       ssm psr.ic | PSR_DEFAULT_BITS
24008 +       ;;
24009 +       srlz.i                                  // guarantee that interruption collection is on
24010 +       ;;
24011 +(p15)  ssm psr.i                               // restore psr.i
24012 +       adds r3=8,r2                            // set up second base pointer
24013 +       ;;
24014 +       SAVE_REST
24015 +       movl r14=ia64_leave_kernel
24016 +       ;;
24017 +       mov rp=r14
24018 +       br.sptk.many ia64_prepare_handle_unaligned
24019 +END(dispatch_unaligned_handler)
24020 +
24021 +       .org ia64_ivt+0x4c00
24022 +/////////////////////////////////////////////////////////////////////////////////////////
24023 +// 0x4c00 Entry 19 (size 64 bundles) Reserved
24024 +       DBG_FAULT(19)
24025 +       FAULT(19)
24026 +
24027 +       /*
24028 +        * There is no particular reason for this code to be here, other than that
24029 +        * there happens to be space here that would go unused otherwise.  If this
24030 +        * fault ever gets "unreserved", simply move the following code to a more
24031 +        * suitable spot...
24032 +        */
24033 +
24034 +ENTRY(dispatch_to_fault_handler)
24035 +       /*
24036 +        * Input:
24037 +        *      psr.ic: off
24038 +        *      r19:    fault vector number (e.g., 24 for General Exception)
24039 +        *      r31:    contains saved predicates (pr)
24040 +        */
24041 +       SAVE_MIN_WITH_COVER_R19
24042 +       alloc r14=ar.pfs,0,0,5,0
24043 +       mov out0=r15
24044 +#ifdef CONFIG_XEN
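24045 +       // Read the virtualized isr/ifa/iim/itir from the shared state area;
24045 +       // each outN register first holds the address it is then loaded from.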
24045 +       movl out1=XSI_ISR
24046 +       ;;
24047 +       adds out2=XSI_IFA-XSI_ISR,out1
24048 +       adds out3=XSI_IIM-XSI_ISR,out1
24049 +       adds out4=XSI_ITIR-XSI_ISR,out1
24050 +       ;;
24051 +       ld8 out1=[out1]
24052 +       ld8 out2=[out2]
24053 +       ld8 out3=[out3]
24054 +       ld8 out4=[out4]
24055 +       ;;
24056 +#else
24057 +       mov out1=cr.isr
24058 +       mov out2=cr.ifa
24059 +       mov out3=cr.iim
24060 +       mov out4=cr.itir
24061 +       ;;
24062 +#endif
24063 +       ssm psr.ic | PSR_DEFAULT_BITS
24064 +       ;;
24065 +       srlz.i                                  // guarantee that interruption collection is on
24066 +       ;;
24067 +(p15)  ssm psr.i                               // restore psr.i
24068 +       adds r3=8,r2                            // set up second base pointer for SAVE_REST
24069 +       ;;
24070 +       SAVE_REST
24071 +       movl r14=ia64_leave_kernel
24072 +       ;;
24073 +       mov rp=r14
24074 +       br.call.sptk.many b6=ia64_fault
24075 +END(dispatch_to_fault_handler)
24076 +
24077 +//
24078 +// --- End of long entries, Beginning of short entries
24079 +//
24080 +
24081 +       .org ia64_ivt+0x5000
24082 +/////////////////////////////////////////////////////////////////////////////////////////
24083 +// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
24084 +ENTRY(page_not_present)
24085 +       DBG_FAULT(20)
24086 +       mov r16=cr.ifa
24087 +       rsm psr.dt
24088 +       /*
24089 +        * The Linux page fault handler doesn't expect non-present pages to be in
24090 +        * the TLB.  Flush the existing entry now, so we meet that expectation.
24091 +        */
24092 +       mov r17=PAGE_SHIFT<<2
24093 +       ;;
24094 +       ptc.l r16,r17
24095 +       ;;
24096 +       mov r31=pr
24097 +       srlz.d
24098 +       br.sptk.many page_fault
24099 +END(page_not_present)
24100 +
24101 +       .org ia64_ivt+0x5100
24102 +/////////////////////////////////////////////////////////////////////////////////////////
24103 +// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
24104 +ENTRY(key_permission)
24105 +       DBG_FAULT(21)
24106 +       mov r16=cr.ifa
24107 +       rsm psr.dt
24108 +       mov r31=pr
24109 +       ;;
24110 +       srlz.d
24111 +       br.sptk.many page_fault
24112 +END(key_permission)
24113 +
24114 +       .org ia64_ivt+0x5200
24115 +/////////////////////////////////////////////////////////////////////////////////////////
24116 +// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
24117 +ENTRY(iaccess_rights)
24118 +       DBG_FAULT(22)
24119 +       mov r16=cr.ifa
24120 +       rsm psr.dt
24121 +       mov r31=pr
24122 +       ;;
24123 +       srlz.d
24124 +       br.sptk.many page_fault
24125 +END(iaccess_rights)
24126 +
24127 +       .org ia64_ivt+0x5300
24128 +/////////////////////////////////////////////////////////////////////////////////////////
24129 +// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
24130 +ENTRY(daccess_rights)
24131 +       DBG_FAULT(23)
24132 +#ifdef CONFIG_XEN
24133 +       movl r16=XSI_IFA
24134 +       ;;
24135 +       ld8 r16=[r16]
24136 +       ;;
24137 +       XEN_HYPER_RSM_PSR_DT;
24138 +#else
24139 +       mov r16=cr.ifa
24140 +       rsm psr.dt
24141 +#endif
24142 +       mov r31=pr
24143 +       ;;
24144 +       srlz.d
24145 +       br.sptk.many page_fault
24146 +END(daccess_rights)
24147 +
24148 +       .org ia64_ivt+0x5400
24149 +/////////////////////////////////////////////////////////////////////////////////////////
24150 +// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
24151 +ENTRY(general_exception)
24152 +       DBG_FAULT(24)
24153 +       mov r16=cr.isr
24154 +       mov r31=pr
24155 +       ;;
24156 +       cmp4.eq p6,p0=0,r16
24157 +(p6)   br.sptk.many dispatch_illegal_op_fault
24158 +       ;;
24159 +       mov r19=24              // fault number
24160 +       br.sptk.many dispatch_to_fault_handler
24161 +END(general_exception)
24162 +
24163 +       .org ia64_ivt+0x5500
24164 +/////////////////////////////////////////////////////////////////////////////////////////
24165 +// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
24166 +ENTRY(disabled_fp_reg)
24167 +       DBG_FAULT(25)
24168 +       rsm psr.dfh             // ensure we can access fph
24169 +       ;;
24170 +       srlz.d
24171 +       mov r31=pr
24172 +       mov r19=25
24173 +       br.sptk.many dispatch_to_fault_handler
24174 +END(disabled_fp_reg)
24175 +
24176 +       .org ia64_ivt+0x5600
24177 +/////////////////////////////////////////////////////////////////////////////////////////
24178 +// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
24179 +ENTRY(nat_consumption)
24180 +       DBG_FAULT(26)
24181 +       FAULT(26)
24182 +END(nat_consumption)
24183 +
24184 +       .org ia64_ivt+0x5700
24185 +/////////////////////////////////////////////////////////////////////////////////////////
24186 +// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
24187 +ENTRY(speculation_vector)
24188 +       DBG_FAULT(27)
24189 +       /*
24190 +        * A [f]chk.[as] instruction needs to take the branch to the recovery code but
24191 +        * this part of the architecture is not implemented in hardware on some CPUs, such
24192 +        * as Itanium.  Thus, in general we need to emulate the behavior.  IIM contains
24193 +        * the relative target (not yet sign extended).  So after sign extending it we
24194 +        * simply add it to IIP.  We also need to reset the EI field of the IPSR to zero,
24195 +        * i.e., the slot to restart into.
24196 +        *
24197 +        * cr.iim contains zero_ext(imm21)
24198 +        */
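24199 +       // Net effect of the two shifts: shl 43 moves imm21's sign bit to bit
24199 +       // 63 and the arithmetic shr 39 brings it back with a residual <<4,
24199 +       // i.e. sign_ext(imm21)*16 -- the byte offset of the target bundle
24199 +       // (IA-64 bundles are 16 bytes).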
24199 +       mov r18=cr.iim
24200 +       ;;
24201 +       mov r17=cr.iip
24202 +       shl r18=r18,43                  // put sign bit in position (43=64-21)
24203 +       ;;
24204 +
24205 +       mov r16=cr.ipsr
24206 +       shr r18=r18,39                  // sign extend (39=43-4)
24207 +       ;;
24208 +
24209 +       add r17=r17,r18                 // now add the offset
24210 +       ;;
24211 +       mov cr.iip=r17
24212 +       dep r16=0,r16,41,2              // clear EI
24213 +       ;;
24214 +
24215 +       mov cr.ipsr=r16
24216 +       ;;
24217 +
24218 +#ifdef CONFIG_XEN
24219 +       XEN_HYPER_RFI;
24220 +#else
24221 +       rfi
24222 +#endif
24223 +END(speculation_vector)
24224 +
24225 +       .org ia64_ivt+0x5800
24226 +/////////////////////////////////////////////////////////////////////////////////////////
24227 +// 0x5800 Entry 28 (size 16 bundles) Reserved
24228 +       DBG_FAULT(28)
24229 +       FAULT(28)
24230 +
24231 +       .org ia64_ivt+0x5900
24232 +/////////////////////////////////////////////////////////////////////////////////////////
24233 +// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
24234 +ENTRY(debug_vector)
24235 +       DBG_FAULT(29)
24236 +       FAULT(29)
24237 +END(debug_vector)
24238 +
24239 +       .org ia64_ivt+0x5a00
24240 +/////////////////////////////////////////////////////////////////////////////////////////
24241 +// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
24242 +ENTRY(unaligned_access)
24243 +       DBG_FAULT(30)
24244 +       mov r16=cr.ipsr
24245 +       mov r31=pr              // prepare to save predicates
24246 +       ;;
24247 +       br.sptk.many dispatch_unaligned_handler
24248 +END(unaligned_access)
24249 +
24250 +       .org ia64_ivt+0x5b00
24251 +/////////////////////////////////////////////////////////////////////////////////////////
24252 +// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
24253 +ENTRY(unsupported_data_reference)
24254 +       DBG_FAULT(31)
24255 +       FAULT(31)
24256 +END(unsupported_data_reference)
24257 +
24258 +       .org ia64_ivt+0x5c00
24259 +/////////////////////////////////////////////////////////////////////////////////////////
24260 +// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
24261 +ENTRY(floating_point_fault)
24262 +       DBG_FAULT(32)
24263 +       FAULT(32)
24264 +END(floating_point_fault)
24265 +
24266 +       .org ia64_ivt+0x5d00
24267 +/////////////////////////////////////////////////////////////////////////////////////////
24268 +// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
24269 +ENTRY(floating_point_trap)
24270 +       DBG_FAULT(33)
24271 +       FAULT(33)
24272 +END(floating_point_trap)
24273 +
24274 +       .org ia64_ivt+0x5e00
24275 +/////////////////////////////////////////////////////////////////////////////////////////
24276 +// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
24277 +ENTRY(lower_privilege_trap)
24278 +       DBG_FAULT(34)
24279 +       FAULT(34)
24280 +END(lower_privilege_trap)
24281 +
24282 +       .org ia64_ivt+0x5f00
24283 +/////////////////////////////////////////////////////////////////////////////////////////
24284 +// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
24285 +ENTRY(taken_branch_trap)
24286 +       DBG_FAULT(35)
24287 +       FAULT(35)
24288 +END(taken_branch_trap)
24289 +
24290 +       .org ia64_ivt+0x6000
24291 +/////////////////////////////////////////////////////////////////////////////////////////
24292 +// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
24293 +ENTRY(single_step_trap)
24294 +       DBG_FAULT(36)
24295 +       FAULT(36)
24296 +END(single_step_trap)
24297 +
24298 +       .org ia64_ivt+0x6100
24299 +/////////////////////////////////////////////////////////////////////////////////////////
24300 +// 0x6100 Entry 37 (size 16 bundles) Reserved
24301 +       DBG_FAULT(37)
24302 +       FAULT(37)
24303 +
24304 +       .org ia64_ivt+0x6200
24305 +/////////////////////////////////////////////////////////////////////////////////////////
24306 +// 0x6200 Entry 38 (size 16 bundles) Reserved
24307 +       DBG_FAULT(38)
24308 +       FAULT(38)
24309 +
24310 +       .org ia64_ivt+0x6300
24311 +/////////////////////////////////////////////////////////////////////////////////////////
24312 +// 0x6300 Entry 39 (size 16 bundles) Reserved
24313 +       DBG_FAULT(39)
24314 +       FAULT(39)
24315 +
24316 +       .org ia64_ivt+0x6400
24317 +/////////////////////////////////////////////////////////////////////////////////////////
24318 +// 0x6400 Entry 40 (size 16 bundles) Reserved
24319 +       DBG_FAULT(40)
24320 +       FAULT(40)
24321 +
24322 +       .org ia64_ivt+0x6500
24323 +/////////////////////////////////////////////////////////////////////////////////////////
24324 +// 0x6500 Entry 41 (size 16 bundles) Reserved
24325 +       DBG_FAULT(41)
24326 +       FAULT(41)
24327 +
24328 +       .org ia64_ivt+0x6600
24329 +/////////////////////////////////////////////////////////////////////////////////////////
24330 +// 0x6600 Entry 42 (size 16 bundles) Reserved
24331 +       DBG_FAULT(42)
24332 +       FAULT(42)
24333 +
24334 +       .org ia64_ivt+0x6700
24335 +/////////////////////////////////////////////////////////////////////////////////////////
24336 +// 0x6700 Entry 43 (size 16 bundles) Reserved
24337 +       DBG_FAULT(43)
24338 +       FAULT(43)
24339 +
24340 +       .org ia64_ivt+0x6800
24341 +/////////////////////////////////////////////////////////////////////////////////////////
24342 +// 0x6800 Entry 44 (size 16 bundles) Reserved
24343 +       DBG_FAULT(44)
24344 +       FAULT(44)
24345 +
24346 +       .org ia64_ivt+0x6900
24347 +/////////////////////////////////////////////////////////////////////////////////////////
24348 +// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
24349 +ENTRY(ia32_exception)
24350 +       DBG_FAULT(45)
24351 +       FAULT(45)
24352 +END(ia32_exception)
24353 +
24354 +       .org ia64_ivt+0x6a00
24355 +/////////////////////////////////////////////////////////////////////////////////////////
24356 +// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
24357 +ENTRY(ia32_intercept)
24358 +       DBG_FAULT(46)
24359 +#ifdef CONFIG_IA32_SUPPORT
24360 +       mov r31=pr
24361 +       mov r16=cr.isr
24362 +       ;;
24363 +       extr.u r17=r16,16,8     // get ISR.code
24364 +       mov r18=ar.eflag
24365 +       mov r19=cr.iim          // old eflag value
24366 +       ;;
24367 +       cmp.ne p6,p0=2,r17
24368 +(p6)   br.cond.spnt 1f         // not a system flag fault
24369 +       xor r16=r18,r19
24370 +       ;;
24371 +       extr.u r17=r16,18,1     // get the eflags.ac bit
24372 +       ;;
24373 +       cmp.eq p6,p0=0,r17
24374 +(p6)   br.cond.spnt 1f         // eflags.ac bit didn't change
24375 +       ;;
24376 +       mov pr=r31,-1           // restore predicate registers
24377 +#ifdef CONFIG_XEN
24378 +       XEN_HYPER_RFI;
24379 +#else
24380 +       rfi
24381 +#endif
24382 +
24383 +1:
24384 +#endif // CONFIG_IA32_SUPPORT
24385 +       FAULT(46)
24386 +END(ia32_intercept)
24387 +
24388 +       .org ia64_ivt+0x6b00
24389 +/////////////////////////////////////////////////////////////////////////////////////////
24390 +// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt  (74)
24391 +ENTRY(ia32_interrupt)
24392 +       DBG_FAULT(47)
24393 +#ifdef CONFIG_IA32_SUPPORT
24394 +       mov r31=pr
24395 +       br.sptk.many dispatch_to_ia32_handler
24396 +#else
24397 +       FAULT(47)
24398 +#endif
24399 +END(ia32_interrupt)
24400 +
24401 +       .org ia64_ivt+0x6c00
24402 +/////////////////////////////////////////////////////////////////////////////////////////
24403 +// 0x6c00 Entry 48 (size 16 bundles) Reserved
24404 +       DBG_FAULT(48)
24405 +       FAULT(48)
24406 +
24407 +       .org ia64_ivt+0x6d00
24408 +/////////////////////////////////////////////////////////////////////////////////////////
24409 +// 0x6d00 Entry 49 (size 16 bundles) Reserved
24410 +       DBG_FAULT(49)
24411 +       FAULT(49)
24412 +
24413 +       .org ia64_ivt+0x6e00
24414 +/////////////////////////////////////////////////////////////////////////////////////////
24415 +// 0x6e00 Entry 50 (size 16 bundles) Reserved
24416 +       DBG_FAULT(50)
24417 +       FAULT(50)
24418 +
24419 +       .org ia64_ivt+0x6f00
24420 +/////////////////////////////////////////////////////////////////////////////////////////
24421 +// 0x6f00 Entry 51 (size 16 bundles) Reserved
24422 +       DBG_FAULT(51)
24423 +       FAULT(51)
24424 +
24425 +       .org ia64_ivt+0x7000
24426 +/////////////////////////////////////////////////////////////////////////////////////////
24427 +// 0x7000 Entry 52 (size 16 bundles) Reserved
24428 +       DBG_FAULT(52)
24429 +       FAULT(52)
24430 +
24431 +       .org ia64_ivt+0x7100
24432 +/////////////////////////////////////////////////////////////////////////////////////////
24433 +// 0x7100 Entry 53 (size 16 bundles) Reserved
24434 +       DBG_FAULT(53)
24435 +       FAULT(53)
24436 +
24437 +       .org ia64_ivt+0x7200
24438 +/////////////////////////////////////////////////////////////////////////////////////////
24439 +// 0x7200 Entry 54 (size 16 bundles) Reserved
24440 +       DBG_FAULT(54)
24441 +       FAULT(54)
24442 +
24443 +       .org ia64_ivt+0x7300
24444 +/////////////////////////////////////////////////////////////////////////////////////////
24445 +// 0x7300 Entry 55 (size 16 bundles) Reserved
24446 +       DBG_FAULT(55)
24447 +       FAULT(55)
24448 +
24449 +       .org ia64_ivt+0x7400
24450 +/////////////////////////////////////////////////////////////////////////////////////////
24451 +// 0x7400 Entry 56 (size 16 bundles) Reserved
24452 +       DBG_FAULT(56)
24453 +       FAULT(56)
24454 +
24455 +       .org ia64_ivt+0x7500
24456 +/////////////////////////////////////////////////////////////////////////////////////////
24457 +// 0x7500 Entry 57 (size 16 bundles) Reserved
24458 +       DBG_FAULT(57)
24459 +       FAULT(57)
24460 +
24461 +       .org ia64_ivt+0x7600
24462 +/////////////////////////////////////////////////////////////////////////////////////////
24463 +// 0x7600 Entry 58 (size 16 bundles) Reserved
24464 +       DBG_FAULT(58)
24465 +       FAULT(58)
24466 +
24467 +       .org ia64_ivt+0x7700
24468 +/////////////////////////////////////////////////////////////////////////////////////////
24469 +// 0x7700 Entry 59 (size 16 bundles) Reserved
24470 +       DBG_FAULT(59)
24471 +       FAULT(59)
24472 +
24473 +       .org ia64_ivt+0x7800
24474 +/////////////////////////////////////////////////////////////////////////////////////////
24475 +// 0x7800 Entry 60 (size 16 bundles) Reserved
24476 +       DBG_FAULT(60)
24477 +       FAULT(60)
24478 +
24479 +       .org ia64_ivt+0x7900
24480 +/////////////////////////////////////////////////////////////////////////////////////////
24481 +// 0x7900 Entry 61 (size 16 bundles) Reserved
24482 +       DBG_FAULT(61)
24483 +       FAULT(61)
24484 +
24485 +       .org ia64_ivt+0x7a00
24486 +/////////////////////////////////////////////////////////////////////////////////////////
24487 +// 0x7a00 Entry 62 (size 16 bundles) Reserved
24488 +       DBG_FAULT(62)
24489 +       FAULT(62)
24490 +
24491 +       .org ia64_ivt+0x7b00
24492 +/////////////////////////////////////////////////////////////////////////////////////////
24493 +// 0x7b00 Entry 63 (size 16 bundles) Reserved
24494 +       DBG_FAULT(63)
24495 +       FAULT(63)
24496 +
24497 +       .org ia64_ivt+0x7c00
24498 +/////////////////////////////////////////////////////////////////////////////////////////
24499 +// 0x7c00 Entry 64 (size 16 bundles) Reserved
24500 +       DBG_FAULT(64)
24501 +       FAULT(64)
24502 +
24503 +       .org ia64_ivt+0x7d00
24504 +/////////////////////////////////////////////////////////////////////////////////////////
24505 +// 0x7d00 Entry 65 (size 16 bundles) Reserved
24506 +       DBG_FAULT(65)
24507 +       FAULT(65)
24508 +
24509 +       .org ia64_ivt+0x7e00
24510 +/////////////////////////////////////////////////////////////////////////////////////////
24511 +// 0x7e00 Entry 66 (size 16 bundles) Reserved
24512 +       DBG_FAULT(66)
24513 +       FAULT(66)
24514 +
24515 +#ifdef CONFIG_XEN
24516 +       /*
24517 +        * There is no particular reason for this code to be here, other than that
24518 +        * there happens to be space here that would go unused otherwise.  If this
24519 +        * fault ever gets "unreserved", simply move the following code to a more
24520 +        * suitable spot...
24521 +        */
24522 +
24523 +GLOBAL_ENTRY(xen_bsw1)
24524 +       /* FIXME: THIS CODE IS NOT NaT SAFE! */
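24525 +       /*
24525 +        * Store 1 to XSI_BANKNUM so the hypervisor knows bank 1 is selected,
24525 +        * then reload r16-r31 from the XSI_BANK1_R16 save area through two
24525 +        * interleaved pointers; r30/r31 are themselves overwritten last.
24525 +        */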
24525 +       movl r30=XSI_BANKNUM;
24526 +       mov r31=1;;
24527 +       st4 [r30]=r31;
24528 +       movl r30=XSI_BANK1_R16;
24529 +       movl r31=XSI_BANK1_R16+8;;
24530 +       ld8 r16=[r30],16; ld8 r17=[r31],16;;
24531 +       ld8 r18=[r30],16; ld8 r19=[r31],16;;
24532 +       ld8 r20=[r30],16; ld8 r21=[r31],16;;
24533 +       ld8 r22=[r30],16; ld8 r23=[r31],16;;
24534 +       ld8 r24=[r30],16; ld8 r25=[r31],16;;
24535 +       ld8 r26=[r30],16; ld8 r27=[r31],16;;
24536 +       ld8 r28=[r30],16; ld8 r29=[r31],16;;
24537 +       ld8 r30=[r30]; ld8 r31=[r31];;
24538 +       br.ret.sptk.many b0
24539 +#endif
24540 +
24541 +       .org ia64_ivt+0x7f00
24542 +/////////////////////////////////////////////////////////////////////////////////////////
24543 +// 0x7f00 Entry 67 (size 16 bundles) Reserved
24544 +       DBG_FAULT(67)
24545 +       FAULT(67)
24546 +
24547 +#ifdef CONFIG_IA32_SUPPORT
24548 +
24549 +       /*
24550 +        * There is no particular reason for this code to be here, other than that
24551 +        * there happens to be space here that would go unused otherwise.  If this
24552 +        * fault ever gets "unreserved", simply moved the following code to a more
24553 +        * suitable spot...
24554 +        */
24555 +
24556 +       // IA32 interrupt entry point
24557 +
24558 +ENTRY(dispatch_to_ia32_handler)
24559 +       SAVE_MIN
24560 +       ;;
24561 +       mov r14=cr.isr
24562 +       ssm psr.ic | PSR_DEFAULT_BITS
24563 +       ;;
24564 +       srlz.i                                  // guarantee that interruption collection is on
24565 +       ;;
24566 +(p15)  ssm psr.i
24567 +       adds r3=8,r2            // Base pointer for SAVE_REST
24568 +       ;;
24569 +       SAVE_REST
24570 +       ;;
24571 +       mov r15=0x80
24572 +       shr r14=r14,16          // Get interrupt number
24573 +       ;;
24574 +       cmp.ne p6,p0=r14,r15
24575 +(p6)   br.call.dpnt.many b6=non_ia32_syscall
24576 +
24577 +       adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions
24578 +       adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp
24579 +       ;;
24580 +       cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
24581 +       ld8 r8=[r14]            // get r8
24582 +       ;;
24583 +       st8 [r15]=r8            // save original EAX in r1 (IA32 procs don't use the GP)
24584 +       ;;
24585 +       alloc r15=ar.pfs,0,0,6,0        // must be first in an insn group
24586 +       ;;
24587 +       ld4 r8=[r14],8          // r8 == eax (syscall number)
24588 +       mov r15=IA32_NR_syscalls
24589 +       ;;
24590 +       cmp.ltu.unc p6,p7=r8,r15
24591 +       ld4 out1=[r14],8        // r9 == ecx
24592 +       ;;
24593 +       ld4 out2=[r14],8        // r10 == edx
24594 +       ;;
24595 +       ld4 out0=[r14]          // r11 == ebx
24596 +       adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp
24597 +       ;;
24598 +       ld4 out5=[r14],PT(R14)-PT(R13)  // r13 == ebp
24599 +       ;;
24600 +       ld4 out3=[r14],PT(R15)-PT(R14)  // r14 == esi
24601 +       adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
24602 +       ;;
24603 +       ld4 out4=[r14]          // r15 == edi
24604 +       movl r16=ia32_syscall_table
24605 +       ;;
24606 +(p6)   shladd r16=r8,3,r16     // force ni_syscall if not valid syscall number
24607 +       ld4 r2=[r2]             // r2 = current_thread_info()->flags
24608 +       ;;
24609 +       ld8 r16=[r16]
24610 +       and r2=_TIF_SYSCALL_TRACEAUDIT,r2       // mask trace or audit
24611 +       ;;
24612 +       mov b6=r16
24613 +       movl r15=ia32_ret_from_syscall
24614 +       cmp.eq p8,p0=r2,r0
24615 +       ;;
24616 +       mov rp=r15
24617 +(p8)   br.call.sptk.many b6=b6
24618 +       br.cond.sptk ia32_trace_syscall
24619 +
24620 +non_ia32_syscall:
24621 +       alloc r15=ar.pfs,0,0,2,0
24622 +       mov out0=r14                            // interrupt #
24623 +       add out1=16,sp                          // pointer to pt_regs
24624 +       ;;                      // avoid WAW on CFM
24625 +       br.call.sptk.many rp=ia32_bad_interrupt
24626 +.ret1: movl r15=ia64_leave_kernel
24627 +       ;;
24628 +       mov rp=r15
24629 +       br.ret.sptk.many rp
24630 +END(dispatch_to_ia32_handler)
24631 +
24632 +#endif /* CONFIG_IA32_SUPPORT */
24633 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/xen_ksyms.c tmp-linux-2.6-xen.patch/arch/ia64/xen/xen_ksyms.c
24634 --- ref-linux-2.6.16.9/arch/ia64/xen/xen_ksyms.c        1970-01-01 01:00:00.000000000 +0100
24635 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/xen_ksyms.c   2006-04-10 00:05:52.000000000 +0200
24636 @@ -0,0 +1,12 @@
24637 +/*
24638 + * Architecture-specific kernel symbols
24639 + *
24640 + * Don't put any exports here unless the symbol is defined in an assembler file.
24641 + * All other exports should be put directly after the definition.
24642 + */
24643 +
24644 +#include <linux/config.h>
24645 +#include <linux/module.h>
24646 +
24647 +extern int is_running_on_xen(void);
24648 +EXPORT_SYMBOL(is_running_on_xen);
24649 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/xenminstate.h tmp-linux-2.6-xen.patch/arch/ia64/xen/xenminstate.h
24650 --- ref-linux-2.6.16.9/arch/ia64/xen/xenminstate.h      1970-01-01 01:00:00.000000000 +0100
24651 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/xenminstate.h 2006-04-10 00:05:52.000000000 +0200
24652 @@ -0,0 +1,367 @@
24653 +#include <linux/config.h>
24654 +
24655 +#include <asm/cache.h>
24656 +
24657 +#ifdef CONFIG_XEN
24658 +#include "../kernel/entry.h"
24659 +#else
24660 +#include "entry.h"
24661 +#endif
24662 +
24663 +/*
24664 + * For ivt.s we want to access the stack virtually so we don't have to disable translation
24665 + * on interrupts.
24666 + *
24667 + *  On entry:
24668 + *     r1:     pointer to current task (ar.k6)
24669 + */
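24670 +/*
24670 + * The pUStk/pKStk predicates select the user- vs. kernel-mode entry path:
24670 + * only a fault taken in user mode needs the switch from the user to the
24670 + * kernel register backing store (RBS).
24670 + */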
24670 +#define MINSTATE_START_SAVE_MIN_VIRT                                                           \
24671 +(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
24672 +       ;;                                                                                      \
24673 +(pUStk)        mov.m r24=ar.rnat;                                                                      \
24674 +(pUStk)        addl r22=IA64_RBS_OFFSET,r1;                    /* compute base of RBS */               \
24675 +(pKStk) mov r1=sp;                                     /* get sp  */                           \
24676 +       ;;                                                                                      \
24677 +(pUStk) lfetch.fault.excl.nt1 [r22];                                                           \
24678 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
24679 +(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
24680 +       ;;                                                                                      \
24681 +(pUStk)        mov ar.bspstore=r22;                            /* switch to kernel RBS */              \
24682 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;                 /* if in kernel mode, use sp (r12) */   \
24683 +       ;;                                                                                      \
24684 +(pUStk)        mov r18=ar.bsp;                                                                         \
24685 +(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
24686 +
24687 +#define MINSTATE_END_SAVE_MIN_VIRT                                                             \
24688 +       bsw.1;                  /* switch back to bank 1 (must be last in insn group) */        \
24689 +       ;;
24690 +
24691 +/*
24692 + * For mca_asm.S we want to access the stack physically since the state is saved before we
24693 + * go virtual and don't want to destroy the iip or ipsr.
24694 + */
24695 +#define MINSTATE_START_SAVE_MIN_PHYS                                                           \
24696 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;                                                         \
24697 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;                                                   \
24698 +(pKStk) ld8 r3 = [r3];;                                                                                \
24699 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;                                            \
24700 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;                                          \
24701 +(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
24702 +(pUStk)        addl r22=IA64_RBS_OFFSET,r1;            /* compute base of register backing store */    \
24703 +       ;;                                                                                      \
24704 +(pUStk)        mov r24=ar.rnat;                                                                        \
24705 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
24706 +(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
24707 +(pUStk)        dep r22=-1,r22,61,3;                    /* compute kernel virtual addr of RBS */        \
24708 +       ;;                                                                                      \
24709 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;         /* if in kernel mode, use sp (r12) */           \
24710 +(pUStk)        mov ar.bspstore=r22;                    /* switch to kernel RBS */                      \
24711 +       ;;                                                                                      \
24712 +(pUStk)        mov r18=ar.bsp;                                                                         \
24713 +(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
24714 +
24715 +#define MINSTATE_END_SAVE_MIN_PHYS                                                             \
24716 +       dep r12=-1,r12,61,3;            /* make sp a kernel virtual address */                  \
24717 +       ;;
24718 +
24719 +#ifdef MINSTATE_VIRT
24720 +# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT)
24721 +# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_VIRT
24722 +# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_VIRT
24723 +#endif
24724 +
24725 +#ifdef MINSTATE_PHYS
24726 +# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT);; tpa reg=reg
24727 +# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_PHYS
24728 +# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_PHYS
24729 +#endif
24730 +
24731 +/*
24732 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
24733 + * the minimum state necessary to allow us to turn psr.ic back
24734 + * on.
24735 + *
24736 + * Assumed state upon entry:
24737 + *     psr.ic: off
24738 + *     r31:    contains saved predicates (pr)
24739 + *
24740 + * Upon exit, the state is as follows:
24741 + *     psr.ic: off
24742 + *      r2 = points to &pt_regs.r16
24743 + *      r8 = contents of ar.ccv
24744 + *      r9 = contents of ar.csd
24745 + *     r10 = contents of ar.ssd
24746 + *     r11 = FPSR_DEFAULT
24747 + *     r12 = kernel sp (kernel virtual address)
24748 + *     r13 = points to current task_struct (kernel virtual address)
24749 + *     p15 = TRUE if psr.i is set in cr.ipsr
24750 + *     predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
24751 + *             preserved
24752 + * CONFIG_XEN note: p6/p7 are not preserved
24753 + *
24754 + * Note that psr.ic is NOT turned on by this macro.  This is so that
24755 + * we can pass interruption state as arguments to a handler.
24756 + */
24757 +#ifdef CONFIG_XEN
24758 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
24759 +       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
24760 +       mov r27=ar.rsc;                 /* M */                                                 \
24761 +       mov r20=r1;                     /* A */                                                 \
24762 +       mov r25=ar.unat;                /* M */                                                 \
24763 +       /* mov r29=cr.ipsr;             /* M */                                                 \
24764 +       movl r29=XSI_IPSR;;                                                                     \
24765 +       ld8 r29=[r29];;                                                                         \
24766 +       mov r26=ar.pfs;                 /* I */                                                 \
24767 +       /* mov r28=cr.iip;              /* M */                                                 \
24768 +       movl r28=XSI_IIP;;                                                                      \
24769 +       ld8 r28=[r28];;                                                                         \
24770 +       mov r21=ar.fpsr;                /* M */                                                 \
24771 +       COVER;                  /* B;; (or nothing) */                                  \
24772 +       ;;                                                                                      \
24773 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
24774 +       ;;                                                                                      \
24775 +       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
24776 +       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
24777 +       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
24778 +       /* switch from user to kernel RBS: */                                                   \
24779 +       ;;                                                                                      \
24780 +       invala;                         /* M */                                                 \
24781 +       /* SAVE_IFS; /* see xen special handling below */                                               \
24782 +       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
24783 +       ;;                                                                                      \
24784 +       MINSTATE_START_SAVE_MIN                                                                 \
24785 +       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
24786 +       adds r16=PT(CR_IPSR),r1;                                                                \
24787 +       ;;                                                                                      \
24788 +       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
24789 +       st8 [r16]=r29;          /* save cr.ipsr */                                              \
24790 +       ;;                                                                                      \
24791 +       lfetch.fault.excl.nt1 [r17];                                                            \
24792 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
24793 +       mov r29=b0                                                                              \
24794 +       ;;                                                                                      \
24795 +       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
24796 +       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
24797 +(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                                   \
24798 +       ;;                                                                                      \
24799 +.mem.offset 0,0; st8.spill [r16]=r8,16;                                                                \
24800 +.mem.offset 8,0; st8.spill [r17]=r9,16;                                                                \
24801 +        ;;                                                                                     \
24802 +.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
24803 +.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
24804 +        ;;                                                                                     \
24805 +       /* xen special handling for possibly lazy cover */                                      \
24806 +       movl r8=XSI_INCOMPL_REGFR;                                                              \
24807 +       ;;                                                                                      \
24808 +       ld4 r30=[r8];                                                                           \
24809 +       ;;                                                                                      \
24810 +       cmp.eq  p6,p7=r30,r0;                                                                   \
24811 +       ;; /* not sure if this stop bit is necessary */                                         \
24812 +(p6)   adds r8=XSI_PRECOVER_IFS-XSI_INCOMPL_REGFR,r8;                                          \
24813 +(p7)   adds r8=XSI_IFS-XSI_INCOMPL_REGFR,r8;                                                   \
24814 +       ;;                                                                                      \
24815 +       ld8 r30=[r8];                                                                           \
24816 +       ;;                                                                                      \
24817 +       st8 [r16]=r28,16;       /* save cr.iip */                                               \
24818 +       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
24819 +(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                          \
24820 +       mov r8=ar.ccv;                                                                          \
24821 +       mov r9=ar.csd;                                                                          \
24822 +       mov r10=ar.ssd;                                                                         \
24823 +       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
24824 +       ;;                                                                                      \
24825 +       st8 [r16]=r25,16;       /* save ar.unat */                                              \
24826 +       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
24827 +       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
24828 +       ;;                                                                                      \
24829 +       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
24830 +(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                              \
24831 +(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                                   \
24832 +       ;;                      /* avoid RAW on r16 & r17 */                                    \
24833 +(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                          \
24834 +       st8 [r17]=r31,16;       /* save predicates */                                           \
24835 +(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                               \
24836 +       ;;                                                                                      \
24837 +       st8 [r16]=r29,16;       /* save b0 */                                                   \
24838 +       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
24839 +       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
24840 +       ;;                                                                                      \
24841 +.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
24842 +.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
24843 +       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
24844 +       ;;                                                                                      \
24845 +.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
24846 +.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
24847 +       mov r13=IA64_KR(CURRENT);       /* establish `current' */                               \
24848 +       ;;                                                                                      \
24849 +.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
24850 +.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
24851 +       ;;                                                                                      \
24852 +.mem.offset 0,0; st8.spill [r16]=r2,16;                                                                \
24853 +.mem.offset 8,0; st8.spill [r17]=r3,16;                                                                \
24854 +       ;;                                                                                      \
24855 +       EXTRA;                                                                                  \
24856 +       mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;                                        \
24857 +       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
24858 +       ;;                                                                                      \
24859 +       movl r1=__gp;           /* establish kernel global pointer */                           \
24860 +       ;;                                                                                      \
24861 +       /* MINSTATE_END_SAVE_MIN */
24862 +#else
24863 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
24864 +       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
24865 +       mov r27=ar.rsc;                 /* M */                                                 \
24866 +       mov r20=r1;                     /* A */                                                 \
24867 +       mov r25=ar.unat;                /* M */                                                 \
24868 +       mov r29=cr.ipsr;                /* M */                                                 \
24869 +       mov r26=ar.pfs;                 /* I */                                                 \
24870 +       mov r28=cr.iip;                 /* M */                                                 \
24871 +       mov r21=ar.fpsr;                /* M */                                                 \
24872 +       COVER;                          /* B;; (or nothing) */                                  \
24873 +       ;;                                                                                      \
24874 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
24875 +       ;;                                                                                      \
24876 +       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
24877 +       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
24878 +       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
24879 +       /* switch from user to kernel RBS: */                                                   \
24880 +       ;;                                                                                      \
24881 +       invala;                         /* M */                                                 \
24882 +       SAVE_IFS;                                                                               \
24883 +       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
24884 +       ;;                                                                                      \
24885 +       MINSTATE_START_SAVE_MIN                                                                 \
24886 +       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
24887 +       adds r16=PT(CR_IPSR),r1;                                                                \
24888 +       ;;                                                                                      \
24889 +       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
24890 +       st8 [r16]=r29;          /* save cr.ipsr */                                              \
24891 +       ;;                                                                                      \
24892 +       lfetch.fault.excl.nt1 [r17];                                                            \
24893 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
24894 +       mov r29=b0                                                                              \
24895 +       ;;                                                                                      \
24896 +       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
24897 +       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
24898 +(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                                   \
24899 +       ;;                                                                                      \
24900 +.mem.offset 0,0; st8.spill [r16]=r8,16;                                                                \
24901 +.mem.offset 8,0; st8.spill [r17]=r9,16;                                                                \
24902 +        ;;                                                                                     \
24903 +.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
24904 +.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
24905 +        ;;                                                                                     \
24906 +       st8 [r16]=r28,16;       /* save cr.iip */                                               \
24907 +       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
24908 +(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                          \
24909 +       mov r8=ar.ccv;                                                                          \
24910 +       mov r9=ar.csd;                                                                          \
24911 +       mov r10=ar.ssd;                                                                         \
24912 +       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
24913 +       ;;                                                                                      \
24914 +       st8 [r16]=r25,16;       /* save ar.unat */                                              \
24915 +       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
24916 +       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
24917 +       ;;                                                                                      \
24918 +       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
24919 +(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                              \
24920 +(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                                   \
24921 +       ;;                      /* avoid RAW on r16 & r17 */                                    \
24922 +(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                          \
24923 +       st8 [r17]=r31,16;       /* save predicates */                                           \
24924 +(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                               \
24925 +       ;;                                                                                      \
24926 +       st8 [r16]=r29,16;       /* save b0 */                                                   \
24927 +       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
24928 +       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
24929 +       ;;                                                                                      \
24930 +.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
24931 +.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
24932 +       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
24933 +       ;;                                                                                      \
24934 +.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
24935 +.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
24936 +       mov r13=IA64_KR(CURRENT);       /* establish `current' */                               \
24937 +       ;;                                                                                      \
24938 +.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
24939 +.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
24940 +       ;;                                                                                      \
24941 +.mem.offset 0,0; st8.spill [r16]=r2,16;                                                                \
24942 +.mem.offset 8,0; st8.spill [r17]=r3,16;                                                                \
24943 +       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
24944 +       ;;                                                                                      \
24945 +       EXTRA;                                                                                  \
24946 +       movl r1=__gp;           /* establish kernel global pointer */                           \
24947 +       ;;                                                                                      \
24948 +       MINSTATE_END_SAVE_MIN
24949 +#endif
24950 +
24951 +/*
24952 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
24953 + *
24954 + * Assumed state upon entry:
24955 + *     psr.ic: on
24956 + *     r2:     points to &pt_regs.r16
24957 + *     r3:     points to &pt_regs.r17
24958 + *     r8:     contents of ar.ccv
24959 + *     r9:     contents of ar.csd
24960 + *     r10:    contents of ar.ssd
24961 + *     r11:    FPSR_DEFAULT
24962 + *
24963 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
24964 + */
24965 +#define SAVE_REST                              \
24966 +.mem.offset 0,0; st8.spill [r2]=r16,16;                \
24967 +.mem.offset 8,0; st8.spill [r3]=r17,16;                \
24968 +       ;;                                      \
24969 +.mem.offset 0,0; st8.spill [r2]=r18,16;                \
24970 +.mem.offset 8,0; st8.spill [r3]=r19,16;                \
24971 +       ;;                                      \
24972 +.mem.offset 0,0; st8.spill [r2]=r20,16;                \
24973 +.mem.offset 8,0; st8.spill [r3]=r21,16;                \
24974 +       mov r18=b6;                             \
24975 +       ;;                                      \
24976 +.mem.offset 0,0; st8.spill [r2]=r22,16;                \
24977 +.mem.offset 8,0; st8.spill [r3]=r23,16;                \
24978 +       mov r19=b7;                             \
24979 +       ;;                                      \
24980 +.mem.offset 0,0; st8.spill [r2]=r24,16;                \
24981 +.mem.offset 8,0; st8.spill [r3]=r25,16;                \
24982 +       ;;                                      \
24983 +.mem.offset 0,0; st8.spill [r2]=r26,16;                \
24984 +.mem.offset 8,0; st8.spill [r3]=r27,16;                \
24985 +       ;;                                      \
24986 +.mem.offset 0,0; st8.spill [r2]=r28,16;                \
24987 +.mem.offset 8,0; st8.spill [r3]=r29,16;                \
24988 +       ;;                                      \
24989 +.mem.offset 0,0; st8.spill [r2]=r30,16;                \
24990 +.mem.offset 8,0; st8.spill [r3]=r31,32;                \
24991 +       ;;                                      \
24992 +       mov ar.fpsr=r11;        /* M-unit */    \
24993 +       st8 [r2]=r8,8;          /* ar.ccv */    \
24994 +       adds r24=PT(B6)-PT(F7),r3;              \
24995 +       ;;                                      \
24996 +       stf.spill [r2]=f6,32;                   \
24997 +       stf.spill [r3]=f7,32;                   \
24998 +       ;;                                      \
24999 +       stf.spill [r2]=f8,32;                   \
25000 +       stf.spill [r3]=f9,32;                   \
25001 +       ;;                                      \
25002 +       stf.spill [r2]=f10;                     \
25003 +       stf.spill [r3]=f11;                     \
25004 +       adds r25=PT(B7)-PT(F11),r3;             \
25005 +       ;;                                      \
25006 +       st8 [r24]=r18,16;       /* b6 */        \
25007 +       st8 [r25]=r19,16;       /* b7 */        \
25008 +       ;;                                      \
25009 +       st8 [r24]=r9;           /* ar.csd */    \
25010 +       st8 [r25]=r10;          /* ar.ssd */    \
25011 +       ;;
25012 +
25013 +#define SAVE_MIN_WITH_COVER    DO_SAVE_MIN(cover, mov r30=cr.ifs,)
25014 +#define SAVE_MIN_WITH_COVER_R19        DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
25015 +#ifdef CONFIG_XEN
25016 +#define SAVE_MIN               break 0;; /* FIXME: non-cover version only for ia32 support? */
25017 +#else
25018 +#define SAVE_MIN               DO_SAVE_MIN(     , mov r30=r0, )
25019 +#endif
25020 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/xenpal.S tmp-linux-2.6-xen.patch/arch/ia64/xen/xenpal.S
25021 --- ref-linux-2.6.16.9/arch/ia64/xen/xenpal.S   1970-01-01 01:00:00.000000000 +0100
25022 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/xenpal.S      2006-04-10 00:05:52.000000000 +0200
25023 @@ -0,0 +1,73 @@
25024 +/*
25025 + * ia64/xen/xenpal.S
25026 + *
25027 + * Alternate PAL routines for Xen.  Heavily leveraged from
25028 + *   ia64/kernel/pal.S
25029 + *
25030 + * Copyright (C) 2005 Hewlett-Packard Co
25031 + *     Dan Magenheimer <dan.magenheimer@hp.com>
25032 + */
25033 +
25034 +#include <asm/asmmacro.h>
25035 +#include <asm/processor.h>
25036 +
25037 +GLOBAL_ENTRY(xen_pal_call_static)
25038 +       .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
25039 +       alloc loc1 = ar.pfs,5,5,0,0
25040 +#ifdef CONFIG_XEN
25041 +       movl r22=running_on_xen;;
25042 +       ld4 r22=[r22];;
25043 +       cmp.eq p7,p0=r22,r0
25044 +(p7)   br.cond.spnt.many __ia64_pal_call_static;;
25045 +#endif
25046 +       movl loc2 = pal_entry_point
25047 +1:     {
25048 +         mov r28 = in0
25049 +         mov r29 = in1
25050 +         mov r8 = ip
25051 +       }
25052 +       ;;
25053 +       ld8 loc2 = [loc2]               // loc2 <- entry point
25054 +       tbit.nz p6,p7 = in4, 0
25055 +       adds r8 = 1f-1b,r8
25056 +       mov loc4=ar.rsc                 // save RSE configuration
25057 +       ;;
25058 +       mov ar.rsc=0                    // put RSE in enforced lazy, LE mode
25059 +       mov loc3 = psr
25060 +       mov loc0 = rp
25061 +       .body
25062 +       mov r30 = in2
25063 +
25064 +#ifdef CONFIG_XEN
25065 +       // this is low priority for paravirtualization, but is called
25066 +       // from the idle loop, so it confuses privop counting
25067 +       movl r31=XSI_PSR_IC
25068 +       ;;
25069 +(p6)   st8 [r31]=r0
25070 +       ;;
25071 +(p7)   adds r31=XSI_PSR_I-XSI_PSR_IC,r31
25072 +       ;;
25073 +(p7)   st4 [r31]=r0
25074 +       ;;
25075 +       mov r31 = in3
25076 +       mov b7 = loc2
25077 +       ;;
25078 +#else
25079 +(p6)   rsm psr.i | psr.ic
25080 +       mov r31 = in3
25081 +       mov b7 = loc2
25082 +
25083 +(p7)   rsm psr.i
25084 +       ;;
25085 +(p6)   srlz.i
25086 +#endif
25087 +       mov rp = r8
25088 +       br.cond.sptk.many b7
25089 +1:     mov psr.l = loc3
25090 +       mov ar.rsc = loc4               // restore RSE configuration
25091 +       mov ar.pfs = loc1
25092 +       mov rp = loc0
25093 +       ;;
25094 +       srlz.d                          // serialize restoration of psr.l
25095 +       br.ret.sptk.many b0
25096 +END(xen_pal_call_static)
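
The CONFIG_XEN branch above replaces the privileged rsm psr.i | psr.ic with plain stores into the Xen shared-state (XSI) area. A rough C restatement of the idea, using a hypothetical layout in place of the real XSI_PSR_IC / XSI_PSR_I offsets from asm-xsi-offsets.h:

    /* Hypothetical shadow layout, for illustration only. */
    struct xsi_psr_shadow {
            unsigned long psr_ic;   /* virtual psr.ic, written with st8 */
            unsigned int  psr_i;    /* virtual psr.i,  written with st4 */
    };

    /* Equivalent of "(p6) st8 [r31]=r0": interruption collection is turned
       off by clearing the shadow bit; no privileged instruction is needed. */
    static inline void xen_rsm_psr_ic(volatile struct xsi_psr_shadow *xsi)
    {
            xsi->psr_ic = 0;
    }
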
25097 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen/xensetup.S tmp-linux-2.6-xen.patch/arch/ia64/xen/xensetup.S
25098 --- ref-linux-2.6.16.9/arch/ia64/xen/xensetup.S 1970-01-01 01:00:00.000000000 +0100
25099 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen/xensetup.S    2006-04-10 00:05:52.000000000 +0200
25100 @@ -0,0 +1,35 @@
25101 +/*
25102 + * Support routines for Xen
25103 + *
25104 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
25105 + */
25106 +
25107 +#include <linux/config.h>
25108 +#include <asm/processor.h>
25109 +#include <asm/asmmacro.h>
25110 +
25111 +       .data
25112 +       .align 8
25113 +       .globl running_on_xen
25114 +running_on_xen:
25115 +       data4 0
25116 +
25117 +#define isBP   p3      // are we the Bootstrap Processor?
25118 +
25119 +       .text
25120 +GLOBAL_ENTRY(early_xen_setup)
25121 +       mov r8=cr.dcr
25122 +(isBP) movl r9=running_on_xen;;
25123 +       extr.u r8=r8,63,1;;
25124 +       cmp.ne p7,p0=r8,r0;;
25125 +(isBP) st4 [r9]=r8
25126 +(p7)   movl r10=xen_ivt;;
25127 +(p7)   mov cr.iva=r10
25128 +       br.ret.sptk.many rp;;
25129 +END(early_xen_setup)
25130 +
25131 +GLOBAL_ENTRY(is_running_on_xen)
25132 +       movl r9=running_on_xen;;
25133 +       ld4 r8=[r9]
25134 +       br.ret.sptk.many rp;;
25135 +END(is_running_on_xen)
25136 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen-mkbuildtree-post tmp-linux-2.6-xen.patch/arch/ia64/xen-mkbuildtree-post
25137 --- ref-linux-2.6.16.9/arch/ia64/xen-mkbuildtree-post   1970-01-01 01:00:00.000000000 +0100
25138 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen-mkbuildtree-post      2006-04-10 00:05:52.000000000 +0200
25139 @@ -0,0 +1,2 @@
25140 +#!/bin/bash
25141 +echo 'NOTHING YET IN ' ${0}
25142 diff -Nurp ref-linux-2.6.16.9/arch/ia64/xen-mkbuildtree-pre tmp-linux-2.6-xen.patch/arch/ia64/xen-mkbuildtree-pre
25143 --- ref-linux-2.6.16.9/arch/ia64/xen-mkbuildtree-pre    1970-01-01 01:00:00.000000000 +0100
25144 +++ tmp-linux-2.6-xen.patch/arch/ia64/xen-mkbuildtree-pre       2006-04-10 00:05:52.000000000 +0200
25145 @@ -0,0 +1,50 @@
25146 +#!/bin/bash
25147 +# restructure directories to match future drivers/xen plan
25148 +# and move aside xen/x86 specific changes
25149 +# WARNING: This directory movement really confuses hg, which makes
25150 +# it difficult to do development in a directory which is being used
25151 +# for building (as all files in mv'd directories are thought by hg
25152 +# to have been deleted).  I don't know how to avoid this right now,
25153 +# but if someone has a better way, I'm all ears.
25154 +if [ ! -e mm.xen-x86 ]
25155 +then
25156 +       mv mm mm.xen-x86
25157 +       mkdir mm
25158 +       mv net net.xen-x86
25159 +       mv kernel kernel.xen-x86
25160 +       mv drivers/acpi/tables.c drivers/acpi/tables.c.xen-x86
25161 +#      mv arch/xen/kernel drivers/xen/core
25162 +#      mv arch/xen arch/xen.xen-x86
25163 +#      mkdir arch/xen
25164 +#      mv arch/xen.xen-x86/configs arch/xen
25165 +#      mv include/asm-generic include/asm-generic.xen-x86
25166 +       mv include/linux include/linux.xen-x86
25167 +       mkdir include/linux
25168 +fi
25169 +
25170 +# need to grab a couple of xen-modified files for generic_page_range and
25171 +# typedef pte_fn_t which are used by driver/xen blkif
25172 +cp mm.xen-x86/memory.c mm/memory.c
25173 +cp include/linux.xen-x86/mm.h include/linux/mm.h
25174 +
25175 +#eventually asm-xsi-offsets needs to be part of hypervisor.h/hypercall.h
25176 +cp ../xen/include/asm-ia64/asm-xsi-offsets.h include/asm-ia64/xen/
25177 +
25178 +#ia64 drivers/xen isn't fully functional yet, workaround...
25179 +#also ignore core/evtchn.c which uses a different irq mechanism than ia64
25180 +#(warning: there be dragons here if these files diverge)
25181 +cp arch/ia64/xen/drivers/Makefile drivers/xen/Makefile
25182 +cp arch/ia64/xen/drivers/coreMakefile drivers/xen/core/Makefile
25183 +
25184 +#not sure where these ia64-specific files will end up in the future
25185 +cp arch/ia64/xen/drivers/xenia64_init.c drivers/xen/core
25186 +cp arch/ia64/xen/drivers/evtchn_ia64.c drivers/xen/core
25187 +
25188 +#still a few x86-ism's in various drivers/xen files, patch them
25189 +#cd drivers/xen
25190 +#if [ ! -e ia64.patch.semaphore ]
25191 +#then
25192 +#      cat ../../arch/ia64/xen/drivers/patches/* | patch -p1 -b
25193 +#fi
25194 +#touch ia64.patch.semaphore
25195 +#cd ../..
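
The comment above says mm/memory.c and include/linux/mm.h are carried over for generic_page_range() and pte_fn_t, which the Xen block frontend uses to rewrite page-table entries over a range. A sketch of a callback a caller would pass in, with the prototypes reproduced from memory and therefore to be treated as approximate:

    #include <linux/mm.h>
    #include <linux/sched.h>        /* init_mm */
    #include <asm/pgtable.h>

    /*
     * Assumed prototypes, as added by the copied include/linux/mm.h
     * (approximate, shown for orientation only):
     *
     *   typedef int (*pte_fn_t)(pte_t *pte, struct page *pte_page,
     *                           unsigned long addr, void *data);
     *   int generic_page_range(struct mm_struct *mm, unsigned long address,
     *                          unsigned long size, pte_fn_t fn, void *data);
     */

    /* Hypothetical callback, applied once per PTE in [addr, addr + size). */
    static int clear_one_pte(pte_t *pte, struct page *pte_page,
                             unsigned long addr, void *data)
    {
            pte_clear(&init_mm, addr, pte);
            return 0;
    }
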
25196 diff -Nurp ref-linux-2.6.16.9/arch/um/kernel/physmem.c tmp-linux-2.6-xen.patch/arch/um/kernel/physmem.c
25197 --- ref-linux-2.6.16.9/arch/um/kernel/physmem.c 2006-04-19 08:10:14.000000000 +0200
25198 +++ tmp-linux-2.6-xen.patch/arch/um/kernel/physmem.c    2006-04-10 00:05:52.000000000 +0200
25199 @@ -225,7 +225,7 @@ EXPORT_SYMBOL(physmem_forget_descriptor)
25200  EXPORT_SYMBOL(physmem_remove_mapping);
25201  EXPORT_SYMBOL(physmem_subst_mapping);
25202  
25203 -void arch_free_page(struct page *page, int order)
25204 +int arch_free_page(struct page *page, int order)
25205  {
25206         void *virt;
25207         int i;
25208 @@ -234,6 +234,8 @@ void arch_free_page(struct page *page, i
25209                 virt = __va(page_to_phys(page + i));
25210                 physmem_remove_mapping(virt);
25211         }
25212 +
25213 +       return 0;
25214  }
25215  
25216  int is_remapped(void *virt)
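
The void-to-int change above gives arch_free_page() a way to report that it has already disposed of the page (the Xen balloon path relies on this); UML keeps its old behaviour and simply returns 0. A hedged sketch of the caller-side contract, simplified from what mm/page_alloc.c of this era does (name suffixed to mark it as a sketch):

    #include <linux/mm.h>

    extern int arch_free_page(struct page *page, int order);

    /* Sketch only: the real function contains the full buddy logic. */
    static void __free_pages_ok_sketch(struct page *page, unsigned int order)
    {
            if (arch_free_page(page, order))
                    return;   /* arch already released the page (e.g. to Xen) */
            /* ... normal buddy-allocator freeing continues here ... */
    }
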
25217 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/ia32/ia32entry-xen.S tmp-linux-2.6-xen.patch/arch/x86_64/ia32/ia32entry-xen.S
25218 --- ref-linux-2.6.16.9/arch/x86_64/ia32/ia32entry-xen.S 1970-01-01 01:00:00.000000000 +0100
25219 +++ tmp-linux-2.6-xen.patch/arch/x86_64/ia32/ia32entry-xen.S    2006-04-10 00:05:52.000000000 +0200
25220 @@ -0,0 +1,721 @@
25221 +/*
25222 + * Compatibility mode system call entry point for x86-64. 
25223 + *             
25224 + * Copyright 2000-2002 Andi Kleen, SuSE Labs.
25225 + */             
25226 +
25227 +#include <asm/dwarf2.h>
25228 +#include <asm/calling.h>
25229 +#include <asm/asm-offsets.h>
25230 +#include <asm/current.h>
25231 +#include <asm/errno.h>
25232 +#include <asm/ia32_unistd.h>   
25233 +#include <asm/thread_info.h>   
25234 +#include <asm/segment.h>
25235 +#include <asm/vsyscall32.h>
25236 +#include <linux/linkage.h>
25237 +
25238 +#define __XEN_X86_64 1
25239 +       
25240 +       .macro IA32_ARG_FIXUP noebp=0
25241 +       movl    %edi,%r8d
25242 +       .if \noebp
25243 +       .else
25244 +       movl    %ebp,%r9d
25245 +       .endif
25246 +       xchg    %ecx,%esi
25247 +       movl    %ebx,%edi
25248 +       movl    %edx,%edx       /* zero extension */
25249 +       .endm 
25250 +
25251 +       /* clobbers %eax */     
25252 +       .macro  CLEAR_RREGS
25253 +       xorl    %eax,%eax
25254 +       movq    %rax,R11(%rsp)
25255 +       movq    %rax,R10(%rsp)
25256 +       movq    %rax,R9(%rsp)
25257 +       movq    %rax,R8(%rsp)
25258 +       .endm
25259 +
25260 +#if defined (__XEN_X86_64)
25261 +#include "../kernel/xen_entry.S"
25262 +               
25263 +#define        __swapgs
25264 +#define __cli
25265 +#define __sti  
25266 +#else
25267 +/*
25268 + * Use the native instructions
25269 + */    
25270 +#define        __swapgs        swapgs
25271 +#define __cli          cli
25272 +#define __sti          sti     
25273 +#endif                 
25274 +
25275 +       .macro CFI_STARTPROC32 simple
25276 +       CFI_STARTPROC   \simple
25277 +       CFI_UNDEFINED   r8
25278 +       CFI_UNDEFINED   r9
25279 +       CFI_UNDEFINED   r10
25280 +       CFI_UNDEFINED   r11
25281 +       CFI_UNDEFINED   r12
25282 +       CFI_UNDEFINED   r13
25283 +       CFI_UNDEFINED   r14
25284 +       CFI_UNDEFINED   r15
25285 +       .endm
25286 +
25287 +/*
25288 + * 32bit SYSENTER instruction entry.
25289 + *
25290 + * Arguments:
25291 + * %eax        System call number.
25292 + * %ebx Arg1
25293 + * %ecx Arg2
25294 + * %edx Arg3
25295 + * %esi Arg4
25296 + * %edi Arg5
25297 + * %ebp user stack
25298 + * 0(%ebp) Arg6        
25299 + *     
25300 + * Interrupts off.
25301 + *     
25302 + * This is purely a fast path. For anything complicated we use the int 0x80
25303 + * path below. Set up a complete hardware stack frame to share code
25304 + * with the int 0x80 path.
25305 + */    
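
The i386 register convention documented above (%eax = number, %ebx..%ebp = arguments) is shared with the int 0x80 slow path this frame is set up to match. A small runnable user-space illustration, assuming a 32-bit build and using getpid (entry 20 in the syscall table at the end of this file):

    #include <stdio.h>

    int main(void)
    {
            long nr = 20;           /* __NR_getpid in the 32-bit table */
            long ret;

            /* %eax carries the syscall number in and the return value out */
            __asm__ volatile ("int $0x80" : "=a"(ret) : "a"(nr) : "memory");
            printf("getpid() = %ld\n", ret);
            return 0;
    }
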
25306 +ENTRY(ia32_sysenter_target)
25307 +       CFI_STARTPROC32 simple
25308 +       CFI_DEF_CFA     rsp,0
25309 +       CFI_REGISTER    rsp,rbp
25310 +       __swapgs 
25311 +       movq    %gs:pda_kernelstack, %rsp
25312 +       addq    $(PDA_STACKOFFSET),%rsp
25313 +       XEN_UNBLOCK_EVENTS(%r11)        
25314 +       __sti
25315 +       movl    %ebp,%ebp               /* zero extension */
25316 +       pushq   $__USER32_DS
25317 +       CFI_ADJUST_CFA_OFFSET 8
25318 +       /*CFI_REL_OFFSET ss,0*/
25319 +       pushq   %rbp
25320 +       CFI_ADJUST_CFA_OFFSET 8
25321 +       CFI_REL_OFFSET rsp,0
25322 +       pushfq
25323 +       CFI_ADJUST_CFA_OFFSET 8
25324 +       /*CFI_REL_OFFSET rflags,0*/
25325 +       movl    $VSYSCALL32_SYSEXIT, %r10d
25326 +       CFI_REGISTER rip,r10
25327 +       pushq   $__USER32_CS
25328 +       CFI_ADJUST_CFA_OFFSET 8
25329 +       /*CFI_REL_OFFSET cs,0*/
25330 +       movl    %eax, %eax
25331 +       pushq   %r10
25332 +       CFI_ADJUST_CFA_OFFSET 8
25333 +       CFI_REL_OFFSET rip,0
25334 +       pushq   %rax
25335 +       CFI_ADJUST_CFA_OFFSET 8
25336 +       cld
25337 +       SAVE_ARGS 0,0,1
25338 +       /* no need to do an access_ok check here because rbp has been
25339 +          32bit zero extended */ 
25340 +1:     movl    (%rbp),%r9d
25341 +       .section __ex_table,"a"
25342 +       .quad 1b,ia32_badarg
25343 +       .previous       
25344 +       GET_THREAD_INFO(%r10)
25345 +       orl    $TS_COMPAT,threadinfo_status(%r10)
25346 +       testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
25347 +       CFI_REMEMBER_STATE
25348 +       jnz  sysenter_tracesys
25349 +sysenter_do_call:      
25350 +       cmpl    $(IA32_NR_syscalls),%eax
25351 +       jae     ia32_badsys
25352 +       IA32_ARG_FIXUP 1
25353 +       call    *ia32_sys_call_table(,%rax,8)
25354 +       movq    %rax,RAX-ARGOFFSET(%rsp)
25355 +       GET_THREAD_INFO(%r10)
25356 +       XEN_BLOCK_EVENTS(%r11)  
25357 +       __cli
25358 +       testl   $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
25359 +       jnz     int_ret_from_sys_call
25360 +       andl    $~TS_COMPAT,threadinfo_status(%r10)
25361 +       /* clear IF, that popfq doesn't enable interrupts early */
25362 +       andl  $~0x200,EFLAGS-R11(%rsp) 
25363 +       RESTORE_ARGS 1,24,1,1,1,1
25364 +       popfq
25365 +       CFI_ADJUST_CFA_OFFSET -8
25366 +       /*CFI_RESTORE rflags*/
25367 +       popq    %rcx                            /* User %esp */
25368 +       CFI_ADJUST_CFA_OFFSET -8
25369 +       CFI_REGISTER rsp,rcx
25370 +       movl    $VSYSCALL32_SYSEXIT,%edx        /* User %eip */
25371 +       CFI_REGISTER rip,rdx
25372 +       __swapgs
25373 +       XEN_UNBLOCK_EVENTS(%r11)                
25374 +       __sti           /* sti only takes effect after the next instruction */
25375 +       /* sysexit */
25376 +       .byte   0xf, 0x35  /* TBD */
25377 +
25378 +sysenter_tracesys:
25379 +       CFI_RESTORE_STATE
25380 +       SAVE_REST
25381 +       CLEAR_RREGS
25382 +       movq    $-ENOSYS,RAX(%rsp)      /* really needed? */
25383 +       movq    %rsp,%rdi        /* &pt_regs -> arg1 */
25384 +       call    syscall_trace_enter
25385 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
25386 +       RESTORE_REST
25387 +       movl    %ebp, %ebp
25388 +       /* no need to do an access_ok check here because rbp has been
25389 +          32bit zero extended */ 
25390 +1:     movl    (%rbp),%r9d
25391 +       .section __ex_table,"a"
25392 +       .quad 1b,ia32_badarg
25393 +       .previous
25394 +       jmp     sysenter_do_call
25395 +       CFI_ENDPROC
25396 +
25397 +/*
25398 + * 32bit SYSCALL instruction entry.
25399 + *
25400 + * Arguments:
25401 + * %eax        System call number.
25402 + * %ebx Arg1
25403 + * %ecx return EIP 
25404 + * %edx Arg3
25405 + * %esi Arg4
25406 + * %edi Arg5
25407 + * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
25408 + * %esp user stack 
25409 + * 0(%esp) Arg6
25410 + *     
25411 + * Interrupts off.
25412 + *     
25413 + * This is purely a fast path. For anything complicated we use the int 0x80
25414 + * path below. Set up a complete hardware stack frame to share code
25415 + * with the int 0x80 path.     
25416 + */    
25417 +ENTRY(ia32_cstar_target)
25418 +       CFI_STARTPROC32 simple
25419 +       CFI_DEF_CFA     rsp,0
25420 +       CFI_REGISTER    rip,rcx
25421 +       /*CFI_REGISTER  rflags,r11*/
25422 +       __swapgs
25423 +       movl    %esp,%r8d
25424 +       CFI_REGISTER    rsp,r8
25425 +       movq    %gs:pda_kernelstack,%rsp
25426 +       XEN_UNBLOCK_EVENTS(%r11)        
25427 +       __sti
25428 +       SAVE_ARGS 8,1,1
25429 +       movl    %eax,%eax       /* zero extension */
25430 +       movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
25431 +       movq    %rcx,RIP-ARGOFFSET(%rsp)
25432 +       CFI_REL_OFFSET rip,RIP-ARGOFFSET
25433 +       movq    %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
25434 +       movl    %ebp,%ecx
25435 +       movq    $__USER32_CS,CS-ARGOFFSET(%rsp)
25436 +       movq    $__USER32_DS,SS-ARGOFFSET(%rsp)
25437 +       movq    %r11,EFLAGS-ARGOFFSET(%rsp)
25438 +       /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
25439 +       movq    %r8,RSP-ARGOFFSET(%rsp) 
25440 +       CFI_REL_OFFSET rsp,RSP-ARGOFFSET
25441 +       /* no need to do an access_ok check here because r8 has been
25442 +          32bit zero extended */ 
25443 +       /* hardware stack frame is complete now */      
25444 +1:     movl    (%r8),%r9d
25445 +       .section __ex_table,"a"
25446 +       .quad 1b,ia32_badarg
25447 +       .previous       
25448 +       GET_THREAD_INFO(%r10)
25449 +       orl   $TS_COMPAT,threadinfo_status(%r10)
25450 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
25451 +       CFI_REMEMBER_STATE
25452 +       jnz   cstar_tracesys
25453 +cstar_do_call: 
25454 +       cmpl $IA32_NR_syscalls,%eax
25455 +       jae  ia32_badsys
25456 +       IA32_ARG_FIXUP 1
25457 +       call *ia32_sys_call_table(,%rax,8)
25458 +       movq %rax,RAX-ARGOFFSET(%rsp)
25459 +       GET_THREAD_INFO(%r10)
25460 +       XEN_BLOCK_EVENTS(%r11)          
25461 +       __cli
25462 +       testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
25463 +       jnz  int_ret_from_sys_call
25464 +       andl $~TS_COMPAT,threadinfo_status(%r10)
25465 +       RESTORE_ARGS 1,-ARG_SKIP,1,1,1
25466 +       movl RIP-ARGOFFSET(%rsp),%ecx
25467 +       CFI_REGISTER rip,rcx
25468 +       movl EFLAGS-ARGOFFSET(%rsp),%r11d       
25469 +       /*CFI_REGISTER rflags,r11*/
25470 +       movl RSP-ARGOFFSET(%rsp),%esp
25471 +       CFI_RESTORE rsp
25472 +       __swapgs
25473 +       sysretl  /* TBD */
25474 +       
25475 +cstar_tracesys:        
25476 +       CFI_RESTORE_STATE
25477 +       SAVE_REST
25478 +       CLEAR_RREGS
25479 +       movq $-ENOSYS,RAX(%rsp) /* really needed? */
25480 +       movq %rsp,%rdi        /* &pt_regs -> arg1 */
25481 +       call syscall_trace_enter
25482 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
25483 +       RESTORE_REST
25484 +       movl RSP-ARGOFFSET(%rsp), %r8d
25485 +       /* no need to do an access_ok check here because r8 has been
25486 +          32bit zero extended */ 
25487 +1:     movl    (%r8),%r9d
25488 +       .section __ex_table,"a"
25489 +       .quad 1b,ia32_badarg
25490 +       .previous
25491 +       jmp cstar_do_call
25492 +                               
25493 +ia32_badarg:
25494 +       movq $-EFAULT,%rax
25495 +       jmp ia32_sysret
25496 +       CFI_ENDPROC
25497 +
25498 +/* 
25499 + * Emulated IA32 system calls via int 0x80. 
25500 + *
25501 + * Arguments:   
25502 + * %eax        System call number.
25503 + * %ebx Arg1
25504 + * %ecx Arg2
25505 + * %edx Arg3
25506 + * %esi Arg4
25507 + * %edi Arg5
25508 + * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
25509 + *
25510 + * Notes:
25511 + * Uses the same stack frame as the x86-64 version.    
25512 + * All registers except %eax must be saved (but ptrace may violate that)
25513 + * Arguments are zero extended. For system calls that want sign extension and
25514 + * take long arguments a wrapper is needed. Most calls can just be called
25515 + * directly.
25516 + * Assumes it is only called from user space and entered with interrupts off.  
25517 + */                            
25518 +
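
As the notes above say, arguments arrive zero extended, and a syscall that takes a signed long therefore needs a thin wrapper to sign-extend. sys32_lseek, referenced in the table below, is the classic case; a sketch modeled on the sys_ia32.c wrappers of this era:

    #include <linux/linkage.h>
    #include <linux/syscalls.h>

    asmlinkage long sys32_lseek(unsigned int fd, int offset, unsigned int whence)
    {
            /* 'offset' arrives zero-extended in a 64-bit register; taking it
               as a 32-bit int and letting C widen it performs the sign
               extension the 64-bit sys_lseek() expects. */
            return sys_lseek(fd, offset, whence);
    }
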
25519 +ENTRY(ia32_syscall)
25520 +       CFI_STARTPROC   simple
25521 +       CFI_DEF_CFA     rsp,SS+8-RIP
25522 +       /*CFI_REL_OFFSET        ss,SS-RIP*/
25523 +       CFI_REL_OFFSET  rsp,RSP-RIP
25524 +       /*CFI_REL_OFFSET        rflags,EFLAGS-RIP*/
25525 +       /*CFI_REL_OFFSET        cs,CS-RIP*/
25526 +       CFI_REL_OFFSET  rip,RIP-RIP
25527 +       __swapgs
25528 +       XEN_UNBLOCK_EVENTS(%r11)
25529 +       __sti
25530 +       movq (%rsp),%rcx
25531 +       movq 8(%rsp),%r11
25532 +        addq $0x10,%rsp /* skip rcx and r11 */
25533 +       movl %eax,%eax
25534 +       pushq %rax
25535 +       CFI_ADJUST_CFA_OFFSET 8
25536 +       cld
25537 +/* 1:  jmp 1b   */
25538 +       /* note the registers are not zero extended to the stack frame.
25539 +          this could be a problem. */
25540 +       SAVE_ARGS 0,0,1
25541 +       GET_THREAD_INFO(%r10)
25542 +       orl   $TS_COMPAT,threadinfo_status(%r10)
25543 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
25544 +       jnz ia32_tracesys
25545 +ia32_do_syscall:       
25546 +       cmpl $(IA32_NR_syscalls),%eax
25547 +       jae  ia32_badsys
25548 +       IA32_ARG_FIXUP
25549 +       call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
25550 +ia32_sysret:
25551 +       movq %rax,RAX-ARGOFFSET(%rsp)
25552 +       jmp int_ret_from_sys_call 
25553 +
25554 +ia32_tracesys:                  
25555 +       SAVE_REST
25556 +       movq $-ENOSYS,RAX(%rsp) /* really needed? */
25557 +       movq %rsp,%rdi        /* &pt_regs -> arg1 */
25558 +       call syscall_trace_enter
25559 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
25560 +       RESTORE_REST
25561 +       jmp ia32_do_syscall
25562 +
25563 +ia32_badsys:
25564 +       movq $0,ORIG_RAX-ARGOFFSET(%rsp)
25565 +       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
25566 +       jmp int_ret_from_sys_call
25567 +
25568 +ni_syscall:
25569 +       movq %rax,%rdi
25570 +       jmp  sys32_ni_syscall                   
25571 +
25572 +quiet_ni_syscall:
25573 +       movq $-ENOSYS,%rax
25574 +       ret
25575 +       CFI_ENDPROC
25576 +       
25577 +       .macro PTREGSCALL label, func, arg
25578 +       .globl \label
25579 +\label:
25580 +       leaq \func(%rip),%rax
25581 +       leaq -ARGOFFSET+8(%rsp),\arg    /* 8 for return address */
25582 +       jmp  ia32_ptregs_common 
25583 +       .endm
25584 +
25585 +       CFI_STARTPROC32
25586 +
25587 +       PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
25588 +       PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
25589 +       PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
25590 +       PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
25591 +       PTREGSCALL stub32_execve, sys32_execve, %rcx
25592 +       PTREGSCALL stub32_fork, sys_fork, %rdi
25593 +       PTREGSCALL stub32_clone, sys32_clone, %rdx
25594 +       PTREGSCALL stub32_vfork, sys_vfork, %rdi
25595 +       PTREGSCALL stub32_iopl, sys_iopl, %rsi
25596 +       PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
25597 +
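
PTREGSCALL above parks the target in %rax and a pointer to the saved pt_regs in the stub-specific register, then funnels through ia32_ptregs_common below. On the C side the frame arrives as an ordinary trailing parameter; for stub32_sigaltstack (pt_regs passed in %rdx, i.e. the third argument) the handler is shaped like this sketch, with the real body living in ia32_signal.c:

    #include <linux/linkage.h>
    #include <linux/errno.h>
    #include <linux/types.h>
    #include <asm/ptrace.h>

    /* Shape only; the real implementation converts the compat stack_t and
       consults regs->rsp for the current user stack pointer. */
    asmlinkage long sys32_sigaltstack(u32 uss_ptr, u32 uoss_ptr,
                                      struct pt_regs *regs)
    {
            return -ENOSYS;         /* placeholder body for illustration */
    }
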
25598 +ENTRY(ia32_ptregs_common)
25599 +       popq %r11
25600 +       CFI_ENDPROC
25601 +       CFI_STARTPROC32 simple
25602 +       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
25603 +       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
25604 +       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
25605 +       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
25606 +       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
25607 +       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
25608 +       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
25609 +/*     CFI_REL_OFFSET  cs,CS-ARGOFFSET*/
25610 +/*     CFI_REL_OFFSET  rflags,EFLAGS-ARGOFFSET*/
25611 +       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
25612 +/*     CFI_REL_OFFSET  ss,SS-ARGOFFSET*/
25613 +       SAVE_REST
25614 +       call *%rax
25615 +       RESTORE_REST
25616 +       jmp  ia32_sysret        /* misbalances the return cache */
25617 +       CFI_ENDPROC
25618 +
25619 +       .section .rodata,"a"
25620 +       .align 8
25621 +       .globl ia32_sys_call_table
25622 +ia32_sys_call_table:
25623 +       .quad sys_restart_syscall
25624 +       .quad sys_exit
25625 +       .quad stub32_fork
25626 +       .quad sys_read
25627 +       .quad sys_write
25628 +       .quad compat_sys_open           /* 5 */
25629 +       .quad sys_close
25630 +       .quad sys32_waitpid
25631 +       .quad sys_creat
25632 +       .quad sys_link
25633 +       .quad sys_unlink                /* 10 */
25634 +       .quad stub32_execve
25635 +       .quad sys_chdir
25636 +       .quad compat_sys_time
25637 +       .quad sys_mknod
25638 +       .quad sys_chmod         /* 15 */
25639 +       .quad sys_lchown16
25640 +       .quad quiet_ni_syscall                  /* old break syscall holder */
25641 +       .quad sys_stat
25642 +       .quad sys32_lseek
25643 +       .quad sys_getpid                /* 20 */
25644 +       .quad compat_sys_mount  /* mount  */
25645 +       .quad sys_oldumount     /* old_umount  */
25646 +       .quad sys_setuid16
25647 +       .quad sys_getuid16
25648 +       .quad compat_sys_stime  /* stime */             /* 25 */
25649 +       .quad sys32_ptrace      /* ptrace */
25650 +       .quad sys_alarm
25651 +       .quad sys_fstat /* (old)fstat */
25652 +       .quad sys_pause
25653 +       .quad compat_sys_utime  /* 30 */
25654 +       .quad quiet_ni_syscall  /* old stty syscall holder */
25655 +       .quad quiet_ni_syscall  /* old gtty syscall holder */
25656 +       .quad sys_access
25657 +       .quad sys_nice  
25658 +       .quad quiet_ni_syscall  /* 35 */        /* old ftime syscall holder */
25659 +       .quad sys_sync
25660 +       .quad sys32_kill
25661 +       .quad sys_rename
25662 +       .quad sys_mkdir
25663 +       .quad sys_rmdir         /* 40 */
25664 +       .quad sys_dup
25665 +       .quad sys32_pipe
25666 +       .quad compat_sys_times
25667 +       .quad quiet_ni_syscall                  /* old prof syscall holder */
25668 +       .quad sys_brk           /* 45 */
25669 +       .quad sys_setgid16
25670 +       .quad sys_getgid16
25671 +       .quad sys_signal
25672 +       .quad sys_geteuid16
25673 +       .quad sys_getegid16     /* 50 */
25674 +       .quad sys_acct
25675 +       .quad sys_umount                        /* new_umount */
25676 +       .quad quiet_ni_syscall                  /* old lock syscall holder */
25677 +       .quad compat_sys_ioctl
25678 +       .quad compat_sys_fcntl64                /* 55 */
25679 +       .quad quiet_ni_syscall                  /* old mpx syscall holder */
25680 +       .quad sys_setpgid
25681 +       .quad quiet_ni_syscall                  /* old ulimit syscall holder */
25682 +       .quad sys32_olduname
25683 +       .quad sys_umask         /* 60 */
25684 +       .quad sys_chroot
25685 +       .quad sys32_ustat
25686 +       .quad sys_dup2
25687 +       .quad sys_getppid
25688 +       .quad sys_getpgrp               /* 65 */
25689 +       .quad sys_setsid
25690 +       .quad sys32_sigaction
25691 +       .quad sys_sgetmask
25692 +       .quad sys_ssetmask
25693 +       .quad sys_setreuid16    /* 70 */
25694 +       .quad sys_setregid16
25695 +       .quad stub32_sigsuspend
25696 +       .quad compat_sys_sigpending
25697 +       .quad sys_sethostname
25698 +       .quad compat_sys_setrlimit      /* 75 */
25699 +       .quad compat_sys_old_getrlimit  /* old_getrlimit */
25700 +       .quad compat_sys_getrusage
25701 +       .quad sys32_gettimeofday
25702 +       .quad sys32_settimeofday
25703 +       .quad sys_getgroups16   /* 80 */
25704 +       .quad sys_setgroups16
25705 +       .quad sys32_old_select
25706 +       .quad sys_symlink
25707 +       .quad sys_lstat
25708 +       .quad sys_readlink              /* 85 */
25709 +#ifdef CONFIG_IA32_AOUT
25710 +       .quad sys_uselib
25711 +#else
25712 +       .quad quiet_ni_syscall
25713 +#endif
25714 +       .quad sys_swapon
25715 +       .quad sys_reboot
25716 +       .quad compat_sys_old_readdir
25717 +       .quad sys32_mmap                /* 90 */
25718 +       .quad sys_munmap
25719 +       .quad sys_truncate
25720 +       .quad sys_ftruncate
25721 +       .quad sys_fchmod
25722 +       .quad sys_fchown16              /* 95 */
25723 +       .quad sys_getpriority
25724 +       .quad sys_setpriority
25725 +       .quad quiet_ni_syscall                  /* old profil syscall holder */
25726 +       .quad compat_sys_statfs
25727 +       .quad compat_sys_fstatfs                /* 100 */
25728 +       .quad sys_ioperm
25729 +       .quad compat_sys_socketcall
25730 +       .quad sys_syslog
25731 +       .quad compat_sys_setitimer
25732 +       .quad compat_sys_getitimer      /* 105 */
25733 +       .quad compat_sys_newstat
25734 +       .quad compat_sys_newlstat
25735 +       .quad compat_sys_newfstat
25736 +       .quad sys32_uname
25737 +       .quad stub32_iopl               /* 110 */
25738 +       .quad sys_vhangup
25739 +       .quad quiet_ni_syscall  /* old "idle" system call */
25740 +       .quad sys32_vm86_warning        /* vm86old */ 
25741 +       .quad compat_sys_wait4
25742 +       .quad sys_swapoff               /* 115 */
25743 +       .quad sys32_sysinfo
25744 +       .quad sys32_ipc
25745 +       .quad sys_fsync
25746 +       .quad stub32_sigreturn
25747 +       .quad stub32_clone              /* 120 */
25748 +       .quad sys_setdomainname
25749 +       .quad sys_uname
25750 +       .quad sys_modify_ldt
25751 +       .quad sys32_adjtimex
25752 +       .quad sys32_mprotect            /* 125 */
25753 +       .quad compat_sys_sigprocmask
25754 +       .quad quiet_ni_syscall          /* create_module */
25755 +       .quad sys_init_module
25756 +       .quad sys_delete_module
25757 +       .quad quiet_ni_syscall          /* 130  get_kernel_syms */
25758 +       .quad sys_quotactl
25759 +       .quad sys_getpgid
25760 +       .quad sys_fchdir
25761 +       .quad quiet_ni_syscall  /* bdflush */
25762 +       .quad sys_sysfs         /* 135 */
25763 +       .quad sys_personality
25764 +       .quad quiet_ni_syscall  /* for afs_syscall */
25765 +       .quad sys_setfsuid16
25766 +       .quad sys_setfsgid16
25767 +       .quad sys_llseek                /* 140 */
25768 +       .quad compat_sys_getdents
25769 +       .quad compat_sys_select
25770 +       .quad sys_flock
25771 +       .quad sys_msync
25772 +       .quad compat_sys_readv          /* 145 */
25773 +       .quad compat_sys_writev
25774 +       .quad sys_getsid
25775 +       .quad sys_fdatasync
25776 +       .quad sys32_sysctl      /* sysctl */
25777 +       .quad sys_mlock         /* 150 */
25778 +       .quad sys_munlock
25779 +       .quad sys_mlockall
25780 +       .quad sys_munlockall
25781 +       .quad sys_sched_setparam
25782 +       .quad sys_sched_getparam   /* 155 */
25783 +       .quad sys_sched_setscheduler
25784 +       .quad sys_sched_getscheduler
25785 +       .quad sys_sched_yield
25786 +       .quad sys_sched_get_priority_max
25787 +       .quad sys_sched_get_priority_min  /* 160 */
25788 +       .quad sys_sched_rr_get_interval
25789 +       .quad compat_sys_nanosleep
25790 +       .quad sys_mremap
25791 +       .quad sys_setresuid16
25792 +       .quad sys_getresuid16   /* 165 */
25793 +       .quad sys32_vm86_warning        /* vm86 */ 
25794 +       .quad quiet_ni_syscall  /* query_module */
25795 +       .quad sys_poll
25796 +       .quad compat_sys_nfsservctl
25797 +       .quad sys_setresgid16   /* 170 */
25798 +       .quad sys_getresgid16
25799 +       .quad sys_prctl
25800 +       .quad stub32_rt_sigreturn
25801 +       .quad sys32_rt_sigaction
25802 +       .quad sys32_rt_sigprocmask      /* 175 */
25803 +       .quad sys32_rt_sigpending
25804 +       .quad compat_sys_rt_sigtimedwait
25805 +       .quad sys32_rt_sigqueueinfo
25806 +       .quad stub32_rt_sigsuspend
25807 +       .quad sys32_pread               /* 180 */
25808 +       .quad sys32_pwrite
25809 +       .quad sys_chown16
25810 +       .quad sys_getcwd
25811 +       .quad sys_capget
25812 +       .quad sys_capset
25813 +       .quad stub32_sigaltstack
25814 +       .quad sys32_sendfile
25815 +       .quad quiet_ni_syscall          /* streams1 */
25816 +       .quad quiet_ni_syscall          /* streams2 */
25817 +       .quad stub32_vfork            /* 190 */
25818 +       .quad compat_sys_getrlimit
25819 +       .quad sys32_mmap2
25820 +       .quad sys32_truncate64
25821 +       .quad sys32_ftruncate64
25822 +       .quad sys32_stat64              /* 195 */
25823 +       .quad sys32_lstat64
25824 +       .quad sys32_fstat64
25825 +       .quad sys_lchown
25826 +       .quad sys_getuid
25827 +       .quad sys_getgid                /* 200 */
25828 +       .quad sys_geteuid
25829 +       .quad sys_getegid
25830 +       .quad sys_setreuid
25831 +       .quad sys_setregid
25832 +       .quad sys_getgroups     /* 205 */
25833 +       .quad sys_setgroups
25834 +       .quad sys_fchown
25835 +       .quad sys_setresuid
25836 +       .quad sys_getresuid
25837 +       .quad sys_setresgid     /* 210 */
25838 +       .quad sys_getresgid
25839 +       .quad sys_chown
25840 +       .quad sys_setuid
25841 +       .quad sys_setgid
25842 +       .quad sys_setfsuid              /* 215 */
25843 +       .quad sys_setfsgid
25844 +       .quad sys_pivot_root
25845 +       .quad sys_mincore
25846 +       .quad sys_madvise
25847 +       .quad compat_sys_getdents64     /* 220 getdents64 */
25848 +       .quad compat_sys_fcntl64        
25849 +       .quad quiet_ni_syscall          /* tux */
25850 +       .quad quiet_ni_syscall          /* security */
25851 +       .quad sys_gettid        
25852 +       .quad sys_readahead     /* 225 */
25853 +       .quad sys_setxattr
25854 +       .quad sys_lsetxattr
25855 +       .quad sys_fsetxattr
25856 +       .quad sys_getxattr
25857 +       .quad sys_lgetxattr     /* 230 */
25858 +       .quad sys_fgetxattr
25859 +       .quad sys_listxattr
25860 +       .quad sys_llistxattr
25861 +       .quad sys_flistxattr
25862 +       .quad sys_removexattr   /* 235 */
25863 +       .quad sys_lremovexattr
25864 +       .quad sys_fremovexattr
25865 +       .quad sys_tkill
25866 +       .quad sys_sendfile64 
25867 +       .quad compat_sys_futex          /* 240 */
25868 +       .quad compat_sys_sched_setaffinity
25869 +       .quad compat_sys_sched_getaffinity
25870 +       .quad sys32_set_thread_area
25871 +       .quad sys32_get_thread_area
25872 +       .quad compat_sys_io_setup       /* 245 */
25873 +       .quad sys_io_destroy
25874 +       .quad compat_sys_io_getevents
25875 +       .quad compat_sys_io_submit
25876 +       .quad sys_io_cancel
25877 +       .quad sys_fadvise64             /* 250 */
25878 +       .quad quiet_ni_syscall  /* free_huge_pages */
25879 +       .quad sys_exit_group
25880 +       .quad sys32_lookup_dcookie
25881 +       .quad sys_epoll_create
25882 +       .quad sys_epoll_ctl             /* 255 */
25883 +       .quad sys_epoll_wait
25884 +       .quad sys_remap_file_pages
25885 +       .quad sys_set_tid_address
25886 +       .quad compat_sys_timer_create
25887 +       .quad compat_sys_timer_settime  /* 260 */
25888 +       .quad compat_sys_timer_gettime
25889 +       .quad sys_timer_getoverrun
25890 +       .quad sys_timer_delete
25891 +       .quad compat_sys_clock_settime
25892 +       .quad compat_sys_clock_gettime  /* 265 */
25893 +       .quad compat_sys_clock_getres
25894 +       .quad compat_sys_clock_nanosleep
25895 +       .quad compat_sys_statfs64
25896 +       .quad compat_sys_fstatfs64
25897 +       .quad sys_tgkill                /* 270 */
25898 +       .quad compat_sys_utimes
25899 +       .quad sys32_fadvise64_64
25900 +       .quad quiet_ni_syscall  /* sys_vserver */
25901 +       .quad sys_mbind
25902 +       .quad compat_sys_get_mempolicy  /* 275 */
25903 +       .quad sys_set_mempolicy
25904 +       .quad compat_sys_mq_open
25905 +       .quad sys_mq_unlink
25906 +       .quad compat_sys_mq_timedsend
25907 +       .quad compat_sys_mq_timedreceive        /* 280 */
25908 +       .quad compat_sys_mq_notify
25909 +       .quad compat_sys_mq_getsetattr
25910 +       .quad compat_sys_kexec_load     /* reserved for kexec */
25911 +       .quad compat_sys_waitid
25912 +       .quad quiet_ni_syscall          /* 285: sys_altroot */
25913 +       .quad sys_add_key
25914 +       .quad sys_request_key
25915 +       .quad sys_keyctl
25916 +       .quad sys_ioprio_set
25917 +       .quad sys_ioprio_get            /* 290 */
25918 +       .quad sys_inotify_init
25919 +       .quad sys_inotify_add_watch
25920 +       .quad sys_inotify_rm_watch
25921 +       .quad sys_migrate_pages
25922 +       .quad compat_sys_openat         /* 295 */
25923 +       .quad sys_mkdirat
25924 +       .quad sys_mknodat
25925 +       .quad sys_fchownat
25926 +       .quad compat_sys_futimesat
25927 +       .quad sys32_fstatat             /* 300 */
25928 +       .quad sys_unlinkat
25929 +       .quad sys_renameat
25930 +       .quad sys_linkat
25931 +       .quad sys_symlinkat
25932 +       .quad sys_readlinkat            /* 305 */
25933 +       .quad sys_fchmodat
25934 +       .quad sys_faccessat
25935 +       .quad sys_ni_syscall            /* pselect6 for now */
25936 +       .quad sys_ni_syscall            /* ppoll for now */
25937 +       .quad sys_unshare               /* 310 */
25938 +ia32_syscall_end:              
25939 +       .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
25940 +               .quad ni_syscall
25941 +       .endr
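The .rept directive above pads the table out to IA32_NR_syscalls with ni_syscall entries, so a 32-bit task invoking any syscall number past sys_unshare (slot 310) lands on a stub that returns -ENOSYS. A minimal user-space sketch, not part of the patch (9999 is just an arbitrary out-of-range number):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	/* 9999 is far past the populated table; the padding
	   entries all return -ENOSYS */
	long ret = syscall(9999);

	if (ret == -1 && errno == ENOSYS)
		printf("out-of-range syscall returns ENOSYS as expected\n");
	return 0;
}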
25942 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/ia32/Makefile tmp-linux-2.6-xen.patch/arch/x86_64/ia32/Makefile
25943 --- ref-linux-2.6.16.9/arch/x86_64/ia32/Makefile        2006-04-19 08:10:14.000000000 +0200
25944 +++ tmp-linux-2.6-xen.patch/arch/x86_64/ia32/Makefile   2006-04-10 00:05:52.000000000 +0200
25945 @@ -23,9 +23,25 @@ quiet_cmd_syscall = SYSCALL $@
25946                            -Wl,-soname=linux-gate.so.1 -o $@ \
25947                            -Wl,-T,$(filter-out FORCE,$^)
25948  
25949 +$(obj)/vsyscall-int80.so \
25950  $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
25951  $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
25952         $(call if_changed,syscall)
25953  
25954 -AFLAGS_vsyscall-sysenter.o = -m32
25955 -AFLAGS_vsyscall-syscall.o = -m32
25956 +AFLAGS_vsyscall-sysenter.o = -m32 -Iarch/i386/kernel
25957 +AFLAGS_vsyscall-syscall.o = -m32 -Iarch/i386/kernel
25958 +
25959 +ifdef CONFIG_XEN
25960 +AFLAGS_vsyscall-int80.o = -m32 -Iarch/i386/kernel
25961 +CFLAGS_syscall32-xen.o += -DUSE_INT80
25962 +AFLAGS_syscall32_syscall-xen.o += -DUSE_INT80
25963 +
25964 +$(obj)/syscall32_syscall-xen.o: \
25965 +       $(foreach F,int80 sysenter syscall,$(obj)/vsyscall-$F.so)
25966 +
25967 +targets := $(foreach F,int80 sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
25968 +
25969 +include $(srctree)/scripts/Makefile.xen
25970 +
25971 +obj-y := $(call cherrypickxen, $(obj-y))
25972 +endif
25973 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/ia32/syscall32_syscall-xen.S tmp-linux-2.6-xen.patch/arch/x86_64/ia32/syscall32_syscall-xen.S
25974 --- ref-linux-2.6.16.9/arch/x86_64/ia32/syscall32_syscall-xen.S 1970-01-01 01:00:00.000000000 +0100
25975 +++ tmp-linux-2.6-xen.patch/arch/x86_64/ia32/syscall32_syscall-xen.S    2006-04-10 00:05:52.000000000 +0200
25976 @@ -0,0 +1,28 @@
25977 +/* 32bit VDSOs mapped into user space. */
25978 +
25979 +       .section ".init.data","aw"
25980 +
25981 +#ifdef USE_INT80
25982 +
25983 +       .globl syscall32_int80
25984 +       .globl syscall32_int80_end
25985 +
25986 +syscall32_int80:
25987 +       .incbin "arch/x86_64/ia32/vsyscall-int80.so"
25988 +syscall32_int80_end:
25989 +
25990 +#endif
25991 +
25992 +       .globl syscall32_syscall
25993 +       .globl syscall32_syscall_end
25994 +
25995 +syscall32_syscall:
25996 +       .incbin "arch/x86_64/ia32/vsyscall-syscall.so"
25997 +syscall32_syscall_end:
25998 +
25999 +       .globl syscall32_sysenter
26000 +       .globl syscall32_sysenter_end
26001 +
26002 +syscall32_sysenter:
26003 +       .incbin "arch/x86_64/ia32/vsyscall-sysenter.so"
26004 +syscall32_sysenter_end:
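Each .incbin blob above is bracketed by a start/end label pair so that C code can size it by pointer subtraction, as the memcpy() calls in syscall32-xen.c below do. A self-contained sketch of that pattern, with an ordinary array standing in for the .incbin payload:

#include <stdio.h>
#include <string.h>

/* stand-in for the .incbin payload: int $0x80; ret */
static const unsigned char blob_start[] = { 0xcd, 0x80, 0xc3 };
static const unsigned char * const blob_end = blob_start + sizeof(blob_start);

int main(void)
{
	unsigned char page[4096];

	memcpy(page, blob_start, blob_end - blob_start);
	printf("copied %zu bytes into the stub page\n",
	       (size_t)(blob_end - blob_start));
	return 0;
}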
26005 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/ia32/syscall32-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/ia32/syscall32-xen.c
26006 --- ref-linux-2.6.16.9/arch/x86_64/ia32/syscall32-xen.c 1970-01-01 01:00:00.000000000 +0100
26007 +++ tmp-linux-2.6-xen.patch/arch/x86_64/ia32/syscall32-xen.c    2006-04-10 00:05:52.000000000 +0200
26008 @@ -0,0 +1,128 @@
26009 +/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
26010 +
26011 +/* vsyscall handling for 32-bit processes. Map a stub page into the
26012 +   process on demand, because 32-bit code cannot reach the kernel's fixmaps */
26013 +
26014 +#include <linux/mm.h>
26015 +#include <linux/string.h>
26016 +#include <linux/kernel.h>
26017 +#include <linux/gfp.h>
26018 +#include <linux/init.h>
26019 +#include <linux/stringify.h>
26020 +#include <linux/security.h>
26021 +#include <asm/proto.h>
26022 +#include <asm/tlbflush.h>
26023 +#include <asm/ia32_unistd.h>
26024 +
26025 +#ifdef USE_INT80
26026 +extern unsigned char syscall32_int80[], syscall32_int80_end[];
26027 +#endif
26028 +extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
26029 +extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
26030 +extern int sysctl_vsyscall32;
26031 +
26032 +char *syscall32_page; 
26033 +#ifndef USE_INT80
26034 +static int use_sysenter = -1;
26035 +#endif
26036 +
26037 +static struct page *
26038 +syscall32_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
26039 +{
26040 +       struct page *p = virt_to_page(adr - vma->vm_start + syscall32_page);
26041 +       get_page(p);
26042 +       return p;
26043 +}
26044 +
26045 +/* Prevent VMA merging */
26046 +static void syscall32_vma_close(struct vm_area_struct *vma)
26047 +{
26048 +}
26049 +
26050 +static struct vm_operations_struct syscall32_vm_ops = {
26051 +       .close = syscall32_vma_close,
26052 +       .nopage = syscall32_nopage,
26053 +};
26054 +
26055 +struct linux_binprm;
26056 +
26057 +/* Set up a VMA at program startup for the vsyscall page */
26058 +int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
26059 +{
26060 +       int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT;
26061 +       struct vm_area_struct *vma;
26062 +       struct mm_struct *mm = current->mm;
26063 +       int ret;
26064 +
26065 +       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
26066 +       if (!vma)
26067 +               return -ENOMEM;
26068 +
26069 +       memset(vma, 0, sizeof(struct vm_area_struct));
26070 +       /* Could randomize here */
26071 +       vma->vm_start = VSYSCALL32_BASE;
26072 +       vma->vm_end = VSYSCALL32_END;
26073 +       /* MAYWRITE to allow gdb to COW and set breakpoints */
26074 +       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
26075 +       vma->vm_flags |= mm->def_flags;
26076 +       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
26077 +       vma->vm_ops = &syscall32_vm_ops;
26078 +       vma->vm_mm = mm;
26079 +
26080 +       down_write(&mm->mmap_sem);
26081 +       if ((ret = insert_vm_struct(mm, vma))) {
26082 +               up_write(&mm->mmap_sem);
26083 +               kmem_cache_free(vm_area_cachep, vma);
26084 +               return ret;
26085 +       }
26086 +       mm->total_vm += npages;
26087 +       up_write(&mm->mmap_sem);
26088 +       return 0;
26089 +}
26090 +
26091 +static int __init init_syscall32(void)
26092 +{ 
26093 +       syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); 
26094 +       if (!syscall32_page) 
26095 +               panic("Cannot allocate syscall32 page"); 
26096 +
26097 +#ifdef USE_INT80
26098 +       /*
26099 +        * At this point we use int 0x80.
26100 +        */
26101 +       memcpy(syscall32_page, syscall32_int80,
26102 +              syscall32_int80_end - syscall32_int80);
26103 +#else
26104 +       if (use_sysenter > 0) {
26105 +               memcpy(syscall32_page, syscall32_sysenter,
26106 +                      syscall32_sysenter_end - syscall32_sysenter);
26107 +       } else {
26108 +               memcpy(syscall32_page, syscall32_syscall,
26109 +                      syscall32_syscall_end - syscall32_syscall);
26110 +       }       
26111 +#endif
26112 +       return 0;
26113 +} 
26114 +
26115 +/*
26116 + * This must be done early in case we have an initrd containing 32-bit
26117 + * binaries (e.g., hotplug). This could be pushed upstream to arch/x86_64.
26118 + */    
26119 +core_initcall(init_syscall32); 
26120 +
26121 +/* May not be __init: called during resume */
26122 +void syscall32_cpu_init(void)
26123 +{
26124 +#ifndef USE_INT80
26125 +       if (use_sysenter < 0)
26126 +               use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
26127 +
26128 +       /* Load these always in case some future AMD CPU supports
26129 +          SYSENTER from compat mode too. */
26130 +       checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
26131 +       checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
26132 +       checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
26133 +
26134 +       wrmsrl(MSR_CSTAR, ia32_cstar_target);
26135 +#endif
26136 +}
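Once syscall32_setup_pages() has mapped the page, a 32-bit process learns the entry point through the AT_SYSINFO auxiliary-vector entry. A hypothetical check, compiled as a 32-bit binary (getauxval() postdates this kernel era, so treat this purely as an illustration):

#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* AT_SYSINFO points at __kernel_vsyscall inside the
	   page mapped by syscall32_setup_pages() */
	unsigned long entry = getauxval(AT_SYSINFO);

	printf("AT_SYSINFO = %#lx\n", entry);
	return 0;
}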
26137 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/ia32/vsyscall-int80.S tmp-linux-2.6-xen.patch/arch/x86_64/ia32/vsyscall-int80.S
26138 --- ref-linux-2.6.16.9/arch/x86_64/ia32/vsyscall-int80.S        1970-01-01 01:00:00.000000000 +0100
26139 +++ tmp-linux-2.6-xen.patch/arch/x86_64/ia32/vsyscall-int80.S   2006-04-10 00:05:52.000000000 +0200
26140 @@ -0,0 +1,58 @@
26141 +/*
26142 + * Code for the vsyscall page.  This version uses the old int $0x80 method.
26143 + *
26144 + * NOTE:
26145 + * 1) __kernel_vsyscall _must_ be first in this page.
26146 + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
26147 + *    for details.
26148 + */
26149 +#include <asm/ia32_unistd.h>
26150 +#include <asm/asm-offsets.h>
26151 +
26152 +       .code32
26153 +       .text
26154 +       .section .text.vsyscall,"ax"
26155 +       .globl __kernel_vsyscall
26156 +       .type __kernel_vsyscall,@function
26157 +__kernel_vsyscall:
26158 +.LSTART_vsyscall:
26159 +       int $0x80
26160 +       ret
26161 +.LEND_vsyscall:
26162 +       .size __kernel_vsyscall,.-.LSTART_vsyscall
26163 +       .previous
26164 +
26165 +       .section .eh_frame,"a",@progbits
26166 +.LSTARTFRAME:
26167 +       .long .LENDCIE-.LSTARTCIE
26168 +.LSTARTCIE:
26169 +       .long 0                 /* CIE ID */
26170 +       .byte 1                 /* Version number */
26171 +       .string "zR"            /* NUL-terminated augmentation string */
26172 +       .uleb128 1              /* Code alignment factor */
26173 +       .sleb128 -4             /* Data alignment factor */
26174 +       .byte 8                 /* Return address register column */
26175 +       .uleb128 1              /* Augmentation value length */
26176 +       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
26177 +       .byte 0x0c              /* DW_CFA_def_cfa */
26178 +       .uleb128 4
26179 +       .uleb128 4
26180 +       .byte 0x88              /* DW_CFA_offset, column 0x8 */
26181 +       .uleb128 1
26182 +       .align 4
26183 +.LENDCIE:
26184 +
26185 +       .long .LENDFDE1-.LSTARTFDE1     /* Length FDE */
26186 +.LSTARTFDE1:
26187 +       .long .LSTARTFDE1-.LSTARTFRAME  /* CIE pointer */
26188 +       .long .LSTART_vsyscall-.        /* PC-relative start address */
26189 +       .long .LEND_vsyscall-.LSTART_vsyscall
26190 +       .uleb128 0                      /* Augmentation length */
26191 +       .align 4
26192 +.LENDFDE1:
26193 +               
26194 +/*
26195 + * Get the common code for the sigreturn entry points.
26196 + */
26197 +#define SYSCALL_ENTER_KERNEL    int $0x80
26198 +#include "vsyscall-sigreturn.S"
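The whole stub assembles to three bytes, cd 80 c3 (int $0x80; ret). A hypothetical 32-bit test that dumps the head of the vsyscall page; 0xffffe000 was the conventional VSYSCALL32_BASE of this era, but the address is an assumption here:

#include <stdio.h>

int main(void)
{
	/* assumed vsyscall page base for 32-bit tasks */
	const unsigned char *p = (const unsigned char *)0xffffe000u;
	int i;

	for (i = 0; i < 8; i++)
		printf("%02x ", p[i]);	/* expect "cd 80 c3 ..." */
	printf("\n");
	return 0;
}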
26199 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/ia32/vsyscall-sigreturn.S tmp-linux-2.6-xen.patch/arch/x86_64/ia32/vsyscall-sigreturn.S
26200 --- ref-linux-2.6.16.9/arch/x86_64/ia32/vsyscall-sigreturn.S    2006-04-19 08:10:14.000000000 +0200
26201 +++ tmp-linux-2.6-xen.patch/arch/x86_64/ia32/vsyscall-sigreturn.S       2006-04-10 00:05:52.000000000 +0200
26202 @@ -120,5 +120,5 @@ __kernel_rt_sigreturn:
26203         .align 4
26204  .LENDFDE3:
26205  
26206 -#include "../../i386/kernel/vsyscall-note.S"
26207 +#include <vsyscall-note.S>
26208  
26209 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/Kconfig tmp-linux-2.6-xen.patch/arch/x86_64/Kconfig
26210 --- ref-linux-2.6.16.9/arch/x86_64/Kconfig      2006-04-19 08:10:14.000000000 +0200
26211 +++ tmp-linux-2.6-xen.patch/arch/x86_64/Kconfig 2006-04-10 00:05:52.000000000 +0200
26212 @@ -119,6 +119,22 @@ config GENERIC_CPU
26213  
26214  endchoice
26215  
26216 +config X86_64_XEN
26217 +       bool "Enable Xen compatible kernel"
26218 +       select SWIOTLB
26219 +       help
26220 +         This option builds a kernel compatible with the Xen hypervisor.
26221 +
26222 +config X86_NO_TSS
26223 +       bool
26224 +       depends on X86_64_XEN
26225 +       default y
26226 +
26227 +config X86_NO_IDT
26228 +       bool
26229 +       depends on X86_64_XEN
26230 +       default y
26231 +
26232  #
26233  # Define implied options from the CPU selection here
26234  #
26235 @@ -134,6 +150,7 @@ config X86_L1_CACHE_SHIFT
26236  
26237  config X86_TSC
26238         bool
26239 +       depends on !X86_64_XEN
26240         default y
26241  
26242  config X86_GOOD_APIC
26243 @@ -176,7 +193,7 @@ config X86_CPUID
26244  
26245  config X86_HT
26246         bool
26247 -       depends on SMP && !MK8
26248 +       depends on SMP && !MK8 && !X86_64_XEN
26249         default y
26250  
26251  config MATH_EMULATION
26252 @@ -190,14 +207,22 @@ config EISA
26253  
26254  config X86_IO_APIC
26255         bool
26256 +       depends on !XEN_UNPRIVILEGED_GUEST
26257         default y
26258  
26259 +config X86_XEN_GENAPIC
26260 +       bool
26261 +       depends on X86_64_XEN
26262 +       default XEN_PRIVILEGED_GUEST || SMP
26263 +
26264  config X86_LOCAL_APIC
26265         bool
26266 +       depends on !XEN_UNPRIVILEGED_GUEST
26267         default y
26268  
26269  config MTRR
26270         bool "MTRR (Memory Type Range Register) support"
26271 +       depends on !XEN_UNPRIVILEGED_GUEST
26272         ---help---
26273           On Intel P6 family processors (Pentium Pro, Pentium II and later)
26274           the Memory Type Range Registers (MTRRs) may be used to control
26275 @@ -238,7 +263,7 @@ config SMP
26276  
26277  config SCHED_SMT
26278         bool "SMT (Hyperthreading) scheduler support"
26279 -       depends on SMP
26280 +       depends on SMP && !X86_64_XEN
26281         default n
26282         help
26283           SMT scheduler support improves the CPU scheduler's decision making
26284 @@ -250,7 +275,7 @@ source "kernel/Kconfig.preempt"
26285  
26286  config NUMA
26287         bool "Non Uniform Memory Access (NUMA) Support"
26288 -       depends on SMP
26289 +       depends on SMP && !X86_64_XEN
26290         help
26291          Enable NUMA (Non Uniform Memory Access) support. The kernel 
26292          will try to allocate memory used by a CPU on the local memory 
26293 @@ -325,6 +350,7 @@ config NR_CPUS
26294         int "Maximum number of CPUs (2-256)"
26295         range 2 256
26296         depends on SMP
26297 +       default "16" if X86_64_XEN
26298         default "8"
26299         help
26300           This allows you to specify the maximum number of CPUs which this
26301 @@ -345,6 +371,7 @@ config HOTPLUG_CPU
26302  
26303  config HPET_TIMER
26304         bool
26305 +       depends on !X86_64_XEN
26306         default y
26307         help
26308           Use the IA-PC HPET (High Precision Event Timer) to manage
26309 @@ -362,7 +389,7 @@ config GART_IOMMU
26310         bool "K8 GART IOMMU support"
26311         default y
26312         select SWIOTLB
26313 -       depends on PCI
26314 +       depends on PCI && !X86_64_XEN
26315         help
26316           Support the IOMMU. Needed to run systems with more than 3GB of memory
26317           properly with 32-bit PCI devices that do not support DAC (Double Address
26318 @@ -380,6 +407,7 @@ config SWIOTLB
26319  
26320  config X86_MCE
26321         bool "Machine check support" if EMBEDDED
26322 +       depends on !X86_64_XEN
26323         default y
26324         help
26325            Include a machine check error handler to report hardware errors.
26326 @@ -405,7 +433,7 @@ config X86_MCE_AMD
26327  
26328  config KEXEC
26329         bool "kexec system call (EXPERIMENTAL)"
26330 -       depends on EXPERIMENTAL
26331 +       depends on EXPERIMENTAL && !X86_64_XEN
26332         help
26333           kexec is a system call that implements the ability to shutdown your
26334           current kernel, and to start another kernel.  It is like a reboot
26335 @@ -488,8 +516,11 @@ config GENERIC_PENDING_IRQ
26336         default y
26337  
26338  menu "Power management options"
26339 +       depends on !XEN_UNPRIVILEGED_GUEST
26340  
26341 +if !X86_64_XEN
26342  source kernel/power/Kconfig
26343 +endif
26344  
26345  source "drivers/acpi/Kconfig"
26346  
26347 @@ -512,6 +543,21 @@ config PCI_MMCONFIG
26348         bool "Support mmconfig PCI config space access"
26349         depends on PCI && ACPI
26350  
26351 +config XEN_PCIDEV_FRONTEND
26352 +       bool "Xen PCI Frontend"
26353 +       depends on PCI && X86_64_XEN
26354 +       default y
26355 +       help
26356 +         The PCI device frontend driver allows the kernel to import arbitrary
26357 +         PCI devices from a PCI backend to support PCI driver domains.
26358 +
26359 +config XEN_PCIDEV_FE_DEBUG
26360 +       bool "Xen PCI Frontend Debugging"
26361 +       depends on XEN_PCIDEV_FRONTEND
26362 +       default n
26363 +       help
26364 +         Enables some debug statements within the PCI Frontend.
26365 +
26366  config UNORDERED_IO
26367         bool "Unordered IO mapping access"
26368         depends on EXPERIMENTAL
26369 @@ -522,6 +568,7 @@ config UNORDERED_IO
26370          from i386. Requires that the driver writer used memory barriers
26371          properly.
26372  
26373 +if !X86_64_XEN
26374  source "drivers/pci/pcie/Kconfig"
26375  
26376  source "drivers/pci/Kconfig"
26377 @@ -529,6 +576,7 @@ source "drivers/pci/Kconfig"
26378  source "drivers/pcmcia/Kconfig"
26379  
26380  source "drivers/pci/hotplug/Kconfig"
26381 +endif
26382  
26383  endmenu
26384  
26385 @@ -592,4 +640,6 @@ source "security/Kconfig"
26386  
26387  source "crypto/Kconfig"
26388  
26389 +source "drivers/xen/Kconfig"
26390 +
26391  source "lib/Kconfig"
26392 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/acpi/Makefile tmp-linux-2.6-xen.patch/arch/x86_64/kernel/acpi/Makefile
26393 --- ref-linux-2.6.16.9/arch/x86_64/kernel/acpi/Makefile 2006-04-19 08:10:14.000000000 +0200
26394 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/acpi/Makefile    2006-04-10 00:05:52.000000000 +0200
26395 @@ -6,3 +6,4 @@ ifneq ($(CONFIG_ACPI_PROCESSOR),)
26396  obj-y                  += processor.o
26397  endif
26398  
26399 +boot-$(CONFIG_XEN)             := ../../../i386/kernel/acpi/boot-xen.o
26400 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/apic-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/apic-xen.c
26401 --- ref-linux-2.6.16.9/arch/x86_64/kernel/apic-xen.c    1970-01-01 01:00:00.000000000 +0100
26402 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/apic-xen.c       2006-04-10 00:05:52.000000000 +0200
26403 @@ -0,0 +1,198 @@
26404 +/*
26405 + *     Local APIC handling, local APIC timers
26406 + *
26407 + *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
26408 + *
26409 + *     Fixes
26410 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
26411 + *                                     thanks to Eric Gilmore
26412 + *                                     and Rolf G. Tews
26413 + *                                     for testing these extensively.
26414 + *     Maciej W. Rozycki       :       Various updates and fixes.
26415 + *     Mikael Pettersson       :       Power Management for UP-APIC.
26416 + *     Pavel Machek and
26417 + *     Mikael Pettersson       :       PM converted to driver model.
26418 + */
26419 +
26420 +#include <linux/config.h>
26421 +#include <linux/init.h>
26422 +
26423 +#include <linux/mm.h>
26424 +#include <linux/delay.h>
26425 +#include <linux/bootmem.h>
26426 +#include <linux/smp_lock.h>
26427 +#include <linux/interrupt.h>
26428 +#include <linux/mc146818rtc.h>
26429 +#include <linux/kernel_stat.h>
26430 +#include <linux/sysdev.h>
26431 +#include <linux/module.h>
26432 +
26433 +#include <asm/atomic.h>
26434 +#include <asm/smp.h>
26435 +#include <asm/mtrr.h>
26436 +#include <asm/mpspec.h>
26437 +#include <asm/desc.h>
26438 +#include <asm/arch_hooks.h>
26439 +#include <asm/hpet.h>
26440 +#include <asm/idle.h>
26441 +
26442 +int apic_verbosity;
26443 +
26444 +/*
26445 + * 'What should we do if we get a hw irq event on an illegal vector?'
26446 + * Each architecture has to answer this for itself.
26447 + */
26448 +void ack_bad_irq(unsigned int irq)
26449 +{
26450 +       printk("unexpected IRQ trap at vector %02x\n", irq);
26451 +       /*
26452 +        * Currently unexpected vectors happen only on SMP and APIC.
26453 +        * We _must_ ack these because every local APIC has only N
26454 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
26455 +        * holds up an irq slot - in excessive cases (when multiple
26456 +        * unexpected vectors occur) that might lock up the APIC
26457 +        * completely.
26458 +        * But don't ack when the APIC is disabled. -AK
26459 +        */
26460 +       if (!disable_apic)
26461 +               ack_APIC_irq();
26462 +}
26463 +
26464 +int setup_profiling_timer(unsigned int multiplier)
26465 +{
26466 +       return -EINVAL;
26467 +}
26468 +
26469 +void smp_local_timer_interrupt(struct pt_regs *regs)
26470 +{
26471 +       profile_tick(CPU_PROFILING, regs);
26472 +#ifndef CONFIG_XEN
26473 +#ifdef CONFIG_SMP
26474 +               update_process_times(user_mode(regs));
26475 +#endif
26476 +#endif
26477 +       /*
26478 +        * We take the 'long' return path, and there every subsystem
26479 +        * grabs the appropriate locks (kernel lock/ irq lock).
26480 +        *
26481 +        * we might want to decouple profiling from the 'long path',
26482 +        * and do the profiling totally in assembly.
26483 +        *
26484 +        * Currently this isn't too much of an issue (performance wise),
26485 +        * we can take more than 100K local irqs per second on a 100 MHz P5.
26486 +        */
26487 +}
26488 +
26489 +/*
26490 + * Local APIC timer interrupt. This is the most natural way of doing
26491 + * local interrupts, but local timer interrupts can be emulated by
26492 + * broadcast interrupts too. [in case the hw doesn't support APIC timers]
26493 + *
26494 + * [ if a single-CPU system runs an SMP kernel then we call the local
26495 + *   interrupt as well. Thus we cannot inline the local irq ... ]
26496 + */
26497 +void smp_apic_timer_interrupt(struct pt_regs *regs)
26498 +{
26499 +       /*
26500 +        * the NMI deadlock-detector uses this.
26501 +        */
26502 +       add_pda(apic_timer_irqs, 1);
26503 +
26504 +       /*
26505 +        * NOTE! We'd better ACK the irq immediately,
26506 +        * because timer handling can be slow.
26507 +        */
26508 +       ack_APIC_irq();
26509 +       /*
26510 +        * update_process_times() expects us to have done irq_enter().
26511 +        * Besides, if we don't, timer interrupts ignore the global
26512 +        * interrupt lock, which is the WrongThing (tm) to do.
26513 +        */
26514 +       exit_idle();
26515 +       irq_enter();
26516 +       smp_local_timer_interrupt(regs);
26517 +       irq_exit();
26518 +}
26519 +
26520 +/*
26521 + * This interrupt should _never_ happen with our APIC/SMP architecture
26522 + */
26523 +asmlinkage void smp_spurious_interrupt(void)
26524 +{
26525 +       unsigned int v;
26526 +       exit_idle();
26527 +       irq_enter();
26528 +       /*
26529 +        * Check if this really is a spurious interrupt and ACK it
26530 +        * if it is a vectored one.  Just in case...
26531 +        * Spurious interrupts should not be ACKed.
26532 +        */
26533 +       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
26534 +       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
26535 +               ack_APIC_irq();
26536 +
26537 +#if 0
26538 +       static unsigned long last_warning; 
26539 +       static unsigned long skipped; 
26540 +
26541 +       /* see sw-dev-man vol 3, chapter 7.4.13.5 */
26542 +       if (time_before(last_warning+30*HZ,jiffies)) { 
26543 +               printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
26544 +                      smp_processor_id(), skipped);
26545 +               last_warning = jiffies; 
26546 +               skipped = 0;
26547 +       } else { 
26548 +               skipped++; 
26549 +       } 
26550 +#endif 
26551 +       irq_exit();
26552 +}
26553 +
26554 +/*
26555 + * This interrupt should never happen with our APIC/SMP architecture
26556 + */
26557 +
26558 +asmlinkage void smp_error_interrupt(void)
26559 +{
26560 +       unsigned int v, v1;
26561 +
26562 +       exit_idle();
26563 +       irq_enter();
26564 +       /* First tickle the hardware, only then report what went on. -- REW */
26565 +       v = apic_read(APIC_ESR);
26566 +       apic_write(APIC_ESR, 0);
26567 +       v1 = apic_read(APIC_ESR);
26568 +       ack_APIC_irq();
26569 +       atomic_inc(&irq_err_count);
26570 +
26571 +       /* Here is what the APIC error bits mean:
26572 +          0: Send CS error
26573 +          1: Receive CS error
26574 +          2: Send accept error
26575 +          3: Receive accept error
26576 +          4: Reserved
26577 +          5: Send illegal vector
26578 +          6: Received illegal vector
26579 +          7: Illegal register address
26580 +       */
26581 +       printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
26582 +               smp_processor_id(), v, v1);
26583 +       irq_exit();
26584 +}
26585 +
26586 +int disable_apic;
26587 +
26588 +/*
26589 + * This initializes the IO-APIC and APIC hardware if this is
26590 + * a UP kernel.
26591 + */
26592 +int __init APIC_init_uniprocessor (void)
26593 +{
26594 +#ifdef CONFIG_X86_IO_APIC
26595 +       if (smp_found_config)
26596 +               if (!skip_ioapic_setup && nr_ioapics)
26597 +                       setup_IO_APIC();
26598 +#endif
26599 +
26600 +       return 0;
26601 +}
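The apic_timer_irqs counter bumped in smp_apic_timer_interrupt() is what shows up as the LOC row of /proc/interrupts. A small observational sketch, assuming the usual /proc/interrupts layout:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/interrupts", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "LOC:", 4) == 0)	/* local timer counts */
			fputs(line, stdout);
	fclose(f);
	return 0;
}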
26602 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/asm-offsets.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/asm-offsets.c
26603 --- ref-linux-2.6.16.9/arch/x86_64/kernel/asm-offsets.c 2006-04-19 08:10:14.000000000 +0200
26604 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/asm-offsets.c    2006-04-10 00:05:52.000000000 +0200
26605 @@ -66,7 +66,9 @@ int main(void)
26606         DEFINE(pbe_address, offsetof(struct pbe, address));
26607         DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
26608         DEFINE(pbe_next, offsetof(struct pbe, next));
26609 +#ifndef CONFIG_X86_NO_TSS
26610         BLANK();
26611         DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
26612 +#endif
26613         return 0;
26614  }
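For context on the DEFINE() lines above: asm-offsets.c is never linked into the kernel; it is compiled with -S and a build script turns the "->" markers in the assembler output into #define lines that assembly sources include. A standalone sketch of the trick with illustrative names; inspect the result with gcc -S rather than assembling it:

#include <stddef.h>

struct tss_like {
	unsigned long sp0;
	unsigned long ist[7];
};

/* emits "->sym value" markers into the generated .s file */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

void offsets(void)
{
	DEFINE(TSS_ist_demo, offsetof(struct tss_like, ist));
}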
26615 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/e820-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/e820-xen.c
26616 --- ref-linux-2.6.16.9/arch/x86_64/kernel/e820-xen.c    1970-01-01 01:00:00.000000000 +0100
26617 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/e820-xen.c       2006-04-10 00:05:52.000000000 +0200
26618 @@ -0,0 +1,744 @@
26619 +/* 
26620 + * Handle the memory map.
26621 + * The functions here do the job until bootmem takes over.
26622 + * $Id$
26623 + *
26624 + *  Getting sanitize_e820_map() in sync with i386 version by applying change:
26625 + *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
26626 + *     Alex Achenbach <xela@slit.de>, December 2002.
26627 + *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
26628 + *
26629 + */
26630 +#include <linux/config.h>
26631 +#include <linux/kernel.h>
26632 +#include <linux/types.h>
26633 +#include <linux/init.h>
26634 +#include <linux/bootmem.h>
26635 +#include <linux/ioport.h>
26636 +#include <linux/string.h>
26637 +#include <linux/kexec.h>
26638 +#include <linux/module.h>
26639 +
26640 +#include <asm/page.h>
26641 +#include <asm/e820.h>
26642 +#include <asm/proto.h>
26643 +#include <asm/bootsetup.h>
26644 +#include <asm/sections.h>
26645 +#include <xen/interface/memory.h>
26646 +
26647 +unsigned long pci_mem_start = 0xaeedbabe;
26648 +
26649 +/* 
26650 + * PFN of last memory page.
26651 + */
26652 +unsigned long end_pfn; 
26653 +EXPORT_SYMBOL(end_pfn);
26654 +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;  
26655 +unsigned long end_pfn_map; 
26656 +
26657 +/* 
26658 + * Add a memory region to the kernel e820 map.
26659 + */ 
26660 +void __init add_memory_region(unsigned long start, unsigned long size, int type)
26661 +{
26662 +       int x = e820.nr_map;
26663 +
26664 +       if (x == E820MAX) {
26665 +               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
26666 +               return;
26667 +       }
26668 +
26669 +       e820.map[x].addr = start;
26670 +       e820.map[x].size = size;
26671 +       e820.map[x].type = type;
26672 +       e820.nr_map++;
26673 +}
26674 +
26675 +#ifndef CONFIG_XEN
26676 +
26677 +/* 
26678 + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
26679 + * The direct mapping extends to end_pfn_map, so that we can directly access
26680 + * apertures, ACPI and other tables without having to play with fixmaps.
26681 + */ 
26682 +
26683 +/* 
26684 + * Last pfn which the user wants to use.
26685 + */
26686 +
26687 +extern struct resource code_resource, data_resource;
26688 +
26689 +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 
26690 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
26691 +{ 
26692 +       unsigned long addr = *addrp, last = addr + size; 
26693 +
26694 +       /* various gunk below that needed for SMP startup */
26695 +       if (addr < 0x8000) { 
26696 +               *addrp = 0x8000;
26697 +               return 1; 
26698 +       }
26699 +
26700 +       /* direct mapping tables of the kernel */
26701 +       if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
26702 +               *addrp = table_end << PAGE_SHIFT; 
26703 +               return 1;
26704 +       } 
26705 +
26706 +       /* initrd */ 
26707 +#ifdef CONFIG_BLK_DEV_INITRD
26708 +       if (LOADER_TYPE && INITRD_START && last >= INITRD_START && 
26709 +           addr < INITRD_START+INITRD_SIZE) { 
26710 +               *addrp = INITRD_START + INITRD_SIZE; 
26711 +               return 1;
26712 +       } 
26713 +#endif
26714 +       /* kernel code + 640k memory hole (later should not be needed, but 
26715 +          be paranoid for now) */
26716 +       if (last >= 640*1024 && addr < __pa_symbol(&_end)) { 
26717 +               *addrp = __pa_symbol(&_end);
26718 +               return 1;
26719 +       }
26720 +       /* XXX ramdisk image here? */ 
26721 +       return 0;
26722 +} 
26723 +
26724 +int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) 
26725 +{ 
26726 +       int i;
26727 +       for (i = 0; i < e820.nr_map; i++) { 
26728 +               struct e820entry *ei = &e820.map[i]; 
26729 +               if (type && ei->type != type) 
26730 +                       continue;
26731 +               if (ei->addr >= end || ei->addr + ei->size <= start)
26732 +                       continue; 
26733 +               return 1; 
26734 +       } 
26735 +       return 0;
26736 +}
26737 +
26738 +/* 
26739 + * Find a free area in a specific range. 
26740 + */ 
26741 +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 
26742 +{ 
26743 +       int i; 
26744 +       for (i = 0; i < e820.nr_map; i++) { 
26745 +               struct e820entry *ei = &e820.map[i]; 
26746 +               unsigned long addr = ei->addr, last; 
26747 +               if (ei->type != E820_RAM) 
26748 +                       continue; 
26749 +               if (addr < start) 
26750 +                       addr = start;
26751 +               if (addr > ei->addr + ei->size) 
26752 +                       continue; 
26753 +               while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
26754 +                       ;
26755 +               last = addr + size;
26756 +               if (last > ei->addr + ei->size)
26757 +                       continue;
26758 +               if (last > end) 
26759 +                       continue;
26760 +               return addr; 
26761 +       } 
26762 +       return -1UL;            
26763 +} 
26764 +
26765 +/* 
26766 + * Free bootmem based on the e820 table for a node.
26767 + */
26768 +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
26769 +{
26770 +       int i;
26771 +       for (i = 0; i < e820.nr_map; i++) {
26772 +               struct e820entry *ei = &e820.map[i]; 
26773 +               unsigned long last, addr;
26774 +
26775 +               if (ei->type != E820_RAM || 
26776 +                   ei->addr+ei->size <= start || 
26777 +                   ei->addr >= end)
26778 +                       continue;
26779 +
26780 +               addr = round_up(ei->addr, PAGE_SIZE);
26781 +               if (addr < start) 
26782 +                       addr = start;
26783 +
26784 +               last = round_down(ei->addr + ei->size, PAGE_SIZE); 
26785 +               if (last >= end)
26786 +                       last = end; 
26787 +
26788 +               if (last > addr && last-addr >= PAGE_SIZE)
26789 +                       free_bootmem_node(pgdat, addr, last-addr);
26790 +       }
26791 +}
26792 +
26793 +/*
26794 + * Find the highest page frame number we have available
26795 + */
26796 +unsigned long __init e820_end_of_ram(void)
26797 +{
26798 +       int i;
26799 +       unsigned long end_pfn = 0;
26800 +       
26801 +       for (i = 0; i < e820.nr_map; i++) {
26802 +               struct e820entry *ei = &e820.map[i]; 
26803 +               unsigned long start, end;
26804 +
26805 +               start = round_up(ei->addr, PAGE_SIZE); 
26806 +               end = round_down(ei->addr + ei->size, PAGE_SIZE); 
26807 +               if (start >= end)
26808 +                       continue;
26809 +               if (ei->type == E820_RAM) { 
26810 +               if (end > end_pfn<<PAGE_SHIFT)
26811 +                       end_pfn = end>>PAGE_SHIFT;
26812 +               } else { 
26813 +                       if (end > end_pfn_map<<PAGE_SHIFT) 
26814 +                               end_pfn_map = end>>PAGE_SHIFT;
26815 +               } 
26816 +       }
26817 +
26818 +       if (end_pfn > end_pfn_map) 
26819 +               end_pfn_map = end_pfn;
26820 +       if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
26821 +               end_pfn_map = MAXMEM>>PAGE_SHIFT;
26822 +       if (end_pfn > end_user_pfn)
26823 +               end_pfn = end_user_pfn;
26824 +       if (end_pfn > end_pfn_map) 
26825 +               end_pfn = end_pfn_map; 
26826 +
26827 +       return end_pfn; 
26828 +}
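A worked example of the clamping at the end of e820_end_of_ram(), with hypothetical numbers: end_pfn derived from RAM, end_pfn_map from the highest e820 entry, and end_user_pfn from a mem= boot option:

#include <stdio.h>

int main(void)
{
	unsigned long end_pfn = 0x100000;	/* highest RAM page: 4GB */
	unsigned long end_pfn_map = 0x104000;	/* highest e820 entry: 4GB+64MB */
	unsigned long end_user_pfn = 0x80000;	/* mem=2G */

	if (end_pfn > end_user_pfn)
		end_pfn = end_user_pfn;
	if (end_pfn > end_pfn_map)
		end_pfn = end_pfn_map;
	/* prints end_pfn=0x80000: the user limit wins, while end_pfn_map
	   keeps the full range so non-RAM tables stay directly mapped */
	printf("end_pfn=%#lx end_pfn_map=%#lx\n", end_pfn, end_pfn_map);
	return 0;
}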
26829 +
26830 +/* 
26831 + * Compute how much memory is missing in a range.
26832 + * Unlike the other functions in this file the arguments are in page numbers.
26833 + */
26834 +unsigned long __init
26835 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
26836 +{
26837 +       unsigned long ram = 0;
26838 +       unsigned long start = start_pfn << PAGE_SHIFT;
26839 +       unsigned long end = end_pfn << PAGE_SHIFT;
26840 +       int i;
26841 +       for (i = 0; i < e820.nr_map; i++) {
26842 +               struct e820entry *ei = &e820.map[i];
26843 +               unsigned long last, addr;
26844 +
26845 +               if (ei->type != E820_RAM ||
26846 +                   ei->addr+ei->size <= start ||
26847 +                   ei->addr >= end)
26848 +                       continue;
26849 +
26850 +               addr = round_up(ei->addr, PAGE_SIZE);
26851 +               if (addr < start)
26852 +                       addr = start;
26853 +
26854 +               last = round_down(ei->addr + ei->size, PAGE_SIZE);
26855 +               if (last >= end)
26856 +                       last = end;
26857 +
26858 +               if (last > addr)
26859 +                       ram += last - addr;
26860 +       }
26861 +       return ((end - start) - ram) >> PAGE_SHIFT;
26862 +}
26863 +
26864 +/*
26865 + * Mark e820 reserved areas as busy for the resource manager.
26866 + */
26867 +void __init e820_reserve_resources(void)
26868 +{
26869 +       int i;
26870 +       for (i = 0; i < e820.nr_map; i++) {
26871 +               struct resource *res;
26872 +               res = alloc_bootmem_low(sizeof(struct resource));
26873 +               switch (e820.map[i].type) {
26874 +               case E820_RAM:  res->name = "System RAM"; break;
26875 +               case E820_ACPI: res->name = "ACPI Tables"; break;
26876 +               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
26877 +               default:        res->name = "reserved";
26878 +               }
26879 +               res->start = e820.map[i].addr;
26880 +               res->end = res->start + e820.map[i].size - 1;
26881 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
26882 +               request_resource(&iomem_resource, res);
26883 +               if (e820.map[i].type == E820_RAM) {
26884 +                       /*
26885 +                        *  We don't know which RAM region contains kernel data,
26886 +                        *  so we try it repeatedly and let the resource manager
26887 +                        *  test it.
26888 +                        */
26889 +                       request_resource(res, &code_resource);
26890 +                       request_resource(res, &data_resource);
26891 +#ifdef CONFIG_KEXEC
26892 +                       request_resource(res, &crashk_res);
26893 +#endif
26894 +               }
26895 +       }
26896 +}
26897 +#endif /* !CONFIG_XEN */
26898 +
26899 +void __init e820_print_map(char *who)
26900 +{
26901 +       int i;
26902 +
26903 +       for (i = 0; i < e820.nr_map; i++) {
26904 +               printk(" %s: %016Lx - %016Lx ", who,
26905 +                       (unsigned long long) e820.map[i].addr,
26906 +                       (unsigned long long) (e820.map[i].addr + e820.map[i].size));
26907 +               switch (e820.map[i].type) {
26908 +               case E820_RAM:  printk("(usable)\n");
26909 +                               break;
26910 +               case E820_RESERVED:
26911 +                               printk("(reserved)\n");
26912 +                               break;
26913 +               case E820_ACPI:
26914 +                               printk("(ACPI data)\n");
26915 +                               break;
26916 +               case E820_NVS:
26917 +                               printk("(ACPI NVS)\n");
26918 +                               break;
26919 +               default:        printk("type %u\n", e820.map[i].type);
26920 +                               break;
26921 +               }
26922 +       }
26923 +}
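Given the format strings above, each map entry becomes one line of the familiar boot output. A tiny reproduction with an illustrative entry:

#include <stdio.h>

int main(void)
{
	/* same format as e820_print_map(); values are made up */
	printf(" %s: %016llx - %016llx ", "BIOS-e820",
	       0ULL, 0x9f000ULL);
	printf("(usable)\n");
	return 0;
}

which prints " BIOS-e820: 0000000000000000 - 000000000009f000 (usable)".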
26924 +
26925 +#ifndef CONFIG_XEN
26926 +/*
26927 + * Sanitize the BIOS e820 map.
26928 + *
26929 + * Some e820 responses include overlapping entries.  The following 
26930 + * replaces the original e820 map with a new one, removing overlaps.
26931 + *
26932 + */
26933 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
26934 +{
26935 +       struct change_member {
26936 +               struct e820entry *pbios; /* pointer to original bios entry */
26937 +               unsigned long long addr; /* address for this change point */
26938 +       };
26939 +       static struct change_member change_point_list[2*E820MAX] __initdata;
26940 +       static struct change_member *change_point[2*E820MAX] __initdata;
26941 +       static struct e820entry *overlap_list[E820MAX] __initdata;
26942 +       static struct e820entry new_bios[E820MAX] __initdata;
26943 +       struct change_member *change_tmp;
26944 +       unsigned long current_type, last_type;
26945 +       unsigned long long last_addr;
26946 +       int chgidx, still_changing;
26947 +       int overlap_entries;
26948 +       int new_bios_entry;
26949 +       int old_nr, new_nr, chg_nr;
26950 +       int i;
26951 +
26952 +       /*
26953 +               Visually we're performing the following (1,2,3,4 = memory types)...
26954 +
26955 +               Sample memory map (w/overlaps):
26956 +                  ____22__________________
26957 +                  ______________________4_
26958 +                  ____1111________________
26959 +                  _44_____________________
26960 +                  11111111________________
26961 +                  ____________________33__
26962 +                  ___________44___________
26963 +                  __________33333_________
26964 +                  ______________22________
26965 +                  ___________________2222_
26966 +                  _________111111111______
26967 +                  _____________________11_
26968 +                  _________________4______
26969 +
26970 +               Sanitized equivalent (no overlap):
26971 +                  1_______________________
26972 +                  _44_____________________
26973 +                  ___1____________________
26974 +                  ____22__________________
26975 +                  ______11________________
26976 +                  _________1______________
26977 +                  __________3_____________
26978 +                  ___________44___________
26979 +                  _____________33_________
26980 +                  _______________2________
26981 +                  ________________1_______
26982 +                  _________________4______
26983 +                  ___________________2____
26984 +                  ____________________33__
26985 +                  ______________________4_
26986 +       */
26987 +
26988 +       /* if there's only one memory region, don't bother */
26989 +       if (*pnr_map < 2)
26990 +               return -1;
26991 +
26992 +       old_nr = *pnr_map;
26993 +
26994 +       /* bail out if we find any unreasonable addresses in bios map */
26995 +       for (i=0; i<old_nr; i++)
26996 +               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
26997 +                       return -1;
26998 +
26999 +       /* create pointers for initial change-point information (for sorting) */
27000 +       for (i=0; i < 2*old_nr; i++)
27001 +               change_point[i] = &change_point_list[i];
27002 +
27003 +       /* record all known change-points (starting and ending addresses),
27004 +          omitting those that are for empty memory regions */
27005 +       chgidx = 0;
27006 +       for (i=0; i < old_nr; i++)      {
27007 +               if (biosmap[i].size != 0) {
27008 +                       change_point[chgidx]->addr = biosmap[i].addr;
27009 +                       change_point[chgidx++]->pbios = &biosmap[i];
27010 +                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
27011 +                       change_point[chgidx++]->pbios = &biosmap[i];
27012 +               }
27013 +       }
27014 +       chg_nr = chgidx;
27015 +
27016 +       /* sort change-point list by memory addresses (low -> high) */
27017 +       still_changing = 1;
27018 +       while (still_changing)  {
27019 +               still_changing = 0;
27020 +               for (i=1; i < chg_nr; i++)  {
27021 +                       /* if <current_addr> > <last_addr>, swap */
27022 +                       /* or, if current=<start_addr> & last=<end_addr>, swap */
27023 +                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
27024 +                               ((change_point[i]->addr == change_point[i-1]->addr) &&
27025 +                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
27026 +                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
27027 +                          )
27028 +                       {
27029 +                               change_tmp = change_point[i];
27030 +                               change_point[i] = change_point[i-1];
27031 +                               change_point[i-1] = change_tmp;
27032 +                               still_changing=1;
27033 +                       }
27034 +               }
27035 +       }
27036 +
27037 +       /* create a new bios memory map, removing overlaps */
27038 +       overlap_entries=0;       /* number of entries in the overlap table */
27039 +       new_bios_entry=0;        /* index for creating new bios map entries */
27040 +       last_type = 0;           /* start with undefined memory type */
27041 +       last_addr = 0;           /* start with 0 as last starting address */
27042 +       /* loop through change-points, determining effect on the new bios map */
27043 +       for (chgidx=0; chgidx < chg_nr; chgidx++)
27044 +       {
27045 +               /* keep track of all overlapping bios entries */
27046 +               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
27047 +               {
27048 +                       /* add map entry to overlap list (> 1 entry implies an overlap) */
27049 +                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
27050 +               }
27051 +               else
27052 +               {
27053 +                       /* remove entry from list (order independent, so swap with last) */
27054 +                       for (i=0; i<overlap_entries; i++)
27055 +                       {
27056 +                               if (overlap_list[i] == change_point[chgidx]->pbios)
27057 +                                       overlap_list[i] = overlap_list[overlap_entries-1];
27058 +                       }
27059 +                       overlap_entries--;
27060 +               }
27061 +               /* if there are overlapping entries, decide which "type" to use */
27062 +               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
27063 +               current_type = 0;
27064 +               for (i=0; i<overlap_entries; i++)
27065 +                       if (overlap_list[i]->type > current_type)
27066 +                               current_type = overlap_list[i]->type;
27067 +               /* continue building up new bios map based on this information */
27068 +               if (current_type != last_type)  {
27069 +                       if (last_type != 0)      {
27070 +                               new_bios[new_bios_entry].size =
27071 +                                       change_point[chgidx]->addr - last_addr;
27072 +                               /* move forward only if the new size was non-zero */
27073 +                               if (new_bios[new_bios_entry].size != 0)
27074 +                                       if (++new_bios_entry >= E820MAX)
27075 +                                               break;  /* no more space left for new bios entries */
27076 +                       }
27077 +                       if (current_type != 0)  {
27078 +                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
27079 +                               new_bios[new_bios_entry].type = current_type;
27080 +                               last_addr=change_point[chgidx]->addr;
27081 +                       }
27082 +                       last_type = current_type;
27083 +               }
27084 +       }
27085 +       new_nr = new_bios_entry;   /* retain count for new bios entries */
27086 +
27087 +       /* copy new bios mapping into original location */
27088 +       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
27089 +       *pnr_map = new_nr;
27090 +
27091 +       return 0;
27092 +}
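The precedence rule inside the loop above ("larger type value wins: 1=usable, 2+ unusable") can be cross-checked with a brute-force sample that ignores the change-point machinery entirely; toy values:

#include <stdio.h>

struct ent { unsigned long addr, size; int type; };

int main(void)
{
	/* three overlapping toy entries */
	struct ent map[] = { { 0, 100, 1 }, { 50, 100, 2 }, { 120, 10, 3 } };
	unsigned long a;
	int i, t;

	for (a = 0; a < 160; a += 10) {
		t = 0;			/* 0 = hole */
		for (i = 0; i < 3; i++)
			if (a >= map[i].addr &&
			    a < map[i].addr + map[i].size &&
			    map[i].type > t)
				t = map[i].type;
		printf("%3lu-%3lu: type %d\n", a, a + 10, t);
	}
	return 0;
}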
27093 +
27094 +/*
27095 + * Copy the BIOS e820 map into a safe place.
27096 + *
27097 + * Sanity-check it while we're at it.
27098 + *
27099 + * If we're lucky and live on a modern system, the setup code
27100 + * will have given us a memory map that we can use to properly
27101 + * set up memory.  If we aren't, we'll fake a memory map.
27102 + *
27103 + * We check to see that the memory map contains at least 2 elements
27104 + * before we'll use it, because the detection code in setup.S may
27105 + * not be perfect and almost every PC known to man has two memory
27106 + * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
27107 + * thinkpad 560x, for example, does not cooperate with the memory
27108 + * detection code.)
27109 + */
27110 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
27111 +{
27112 +       /* Only one memory region (or negative)? Ignore it */
27113 +       if (nr_map < 2)
27114 +               return -1;
27115 +
27116 +       do {
27117 +               unsigned long start = biosmap->addr;
27118 +               unsigned long size = biosmap->size;
27119 +               unsigned long end = start + size;
27120 +               unsigned long type = biosmap->type;
27121 +
27122 +               /* Overflow in 64 bits? Ignore the memory map. */
27123 +               if (start > end)
27124 +                       return -1;
27125 +
27126 +               /*
27127 +                * Some BIOSes claim RAM in the 640k - 1M region.
27128 +                * Not right. Fix it up.
27129 +                * 
27130 +                * This should be removed on Hammer, which is supposed not to
27131 +                * have non-e820-covered ISA mappings there, but I had some strange
27132 +                * problems, so it stays for now.  -AK
27133 +                */
27134 +               if (type == E820_RAM) {
27135 +                       if (start < 0x100000ULL && end > 0xA0000ULL) {
27136 +                               if (start < 0xA0000ULL)
27137 +                                       add_memory_region(start, 0xA0000ULL-start, type);
27138 +                               if (end <= 0x100000ULL)
27139 +                                       continue;
27140 +                               start = 0x100000ULL;
27141 +                               size = end - start;
27142 +                       }
27143 +               }
27144 +
27145 +               add_memory_region(start, size, type);
27146 +       } while (biosmap++,--nr_map);
27147 +       return 0;
27148 +}
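A worked instance of the 640k-1M clipping in copy_e820_map(): a hypothetical BIOS entry claiming RAM straight across the hole is split in two, and the VGA/BIOS hole itself is dropped:

#include <stdio.h>

int main(void)
{
	/* hypothetical BIOS entry: RAM from 636k to 1M+4k */
	unsigned long start = 0x9f000, end = 0x101000;

	if (start < 0x100000UL && end > 0xA0000UL) {
		if (start < 0xA0000UL)
			printf("RAM %#lx-%#lx\n", start, 0xA0000UL);
		if (end > 0x100000UL)
			printf("RAM %#lx-%#lx\n", 0x100000UL, end);
	}
	/* the 0xa0000-0x100000 hole is never added as RAM */
	return 0;
}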
27149 +
27150 +void __init setup_memory_region(void)
27151 +{
27152 +       char *who = "BIOS-e820";
27153 +
27154 +       /*
27155 +        * Try to copy the BIOS-supplied E820-map.
27156 +        *
27157 +        * Otherwise fake a memory map; one section from 0k->640k,
27158 +        * the next section from 1mb->appropriate_mem_k
27159 +        */
27160 +       sanitize_e820_map(E820_MAP, &E820_MAP_NR);
27161 +       if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
27162 +               unsigned long mem_size;
27163 +
27164 +               /* compare results from other methods and take the greater */
27165 +               if (ALT_MEM_K < EXT_MEM_K) {
27166 +                       mem_size = EXT_MEM_K;
27167 +                       who = "BIOS-88";
27168 +               } else {
27169 +                       mem_size = ALT_MEM_K;
27170 +                       who = "BIOS-e801";
27171 +               }
27172 +
27173 +               e820.nr_map = 0;
27174 +               add_memory_region(0, LOWMEMSIZE(), E820_RAM);
27175 +               add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
27176 +       }
27177 +       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
27178 +       e820_print_map(who);
27179 +}
27180 +
27181 +#else  /* CONFIG_XEN */
27182 +
27183 +extern unsigned long xen_override_max_pfn;
27184 +extern union xen_start_info_union xen_start_info_union;
27185 +
27186 +unsigned long __init e820_end_of_ram(void)
27187 +{
27188 +       unsigned long max_end_pfn;
27189 +
27190 +       if (xen_override_max_pfn == 0) {
27191 +               max_end_pfn = xen_start_info->nr_pages;
27192 +               /* Default 8MB slack (to balance backend allocations). */
27193 +               max_end_pfn += 8 << (20 - PAGE_SHIFT);
27194 +       } else if (xen_override_max_pfn > xen_start_info->nr_pages) {
27195 +               max_end_pfn = xen_override_max_pfn;
27196 +       } else {
27197 +               max_end_pfn = xen_start_info->nr_pages;
27198 +       }
27199 +
27200 +       return max_end_pfn;
27201 +}
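A quick check of the slack arithmetic above, assuming 4k pages (PAGE_SHIFT == 12): 8 << (20 - PAGE_SHIFT) is 8 << 8 = 2048 page frames, and 2048 * 4096 bytes = 8MB, matching the comment.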
27202 +
27203 +unsigned long __init
27204 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
27205 +{
27206 +       return 0;
27207 +}
27208 +
27209 +void __init e820_reserve_resources(void) 
27210 +{
27211 +       dom0_op_t op;
27212 +       struct dom0_memory_map_entry *map;
27213 +       unsigned long gapstart, gapsize, round, last;
27214 +       int i, found = 0;
27215 +
27216 +       if (!(xen_start_info->flags & SIF_INITDOMAIN))
27217 +               return;
27218 +
27219 +       map = alloc_bootmem_low_pages(PAGE_SIZE);
27220 +       op.cmd = DOM0_PHYSICAL_MEMORY_MAP;
27221 +       op.u.physical_memory_map.memory_map = map;
27222 +       op.u.physical_memory_map.max_map_entries =
27223 +               PAGE_SIZE / sizeof(struct dom0_memory_map_entry);
27224 +       BUG_ON(HYPERVISOR_dom0_op(&op));
27225 +
27226 +       last = 0x100000000ULL;
27227 +       gapstart = 0x10000000;
27228 +       gapsize = 0x400000;
27229 +
27230 +       for (i = op.u.physical_memory_map.nr_map_entries - 1; i >= 0; i--) {
27231 +               struct resource *res;
27232 +
27233 +               if ((last > map[i].end) && ((last - map[i].end) > gapsize)) {
27234 +                       gapsize = last - map[i].end;
27235 +                       gapstart = map[i].end;
27236 +                       found = 1;
27237 +               }
27238 +               if (map[i].start < last)
27239 +                       last = map[i].start;
27240 +
27241 +               if (map[i].end > 0x100000000ULL)
27242 +                       continue;
27243 +               res = alloc_bootmem_low(sizeof(struct resource));
27244 +               res->name = map[i].is_ram ? "System RAM" : "reserved";
27245 +               res->start = map[i].start;
27246 +               res->end = map[i].end - 1;
27247 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
27248 +               request_resource(&iomem_resource, res);
27249 +       }
27250 +
27251 +       free_bootmem(__pa(map), PAGE_SIZE);
27252 +
27253 +       if (!found) {
27254 +               gapstart = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
27255 +               gapstart = (gapstart << PAGE_SHIFT) + 1024*1024;
27256 +               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
27257 +                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
27258 +       }
27259 +
27260 +       /*
27261 +        * See how much we want to round up: start off with
27262 +        * rounding to the next 1MB area.
27263 +        */
27264 +       round = 0x100000;
27265 +       while ((gapsize >> 4) > round)
27266 +               round += round;
27267 +       /* Fun with two's complement */
27268 +       pci_mem_start = (gapstart + round) & -round;
27269 +
27270 +       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
27271 +               pci_mem_start, gapstart, gapsize);
27272 +}
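The rounding trick above rewards a worked example: with round a power of two, -round is the two's-complement mask ~(round - 1), so (gapstart + round) & -round rounds gapstart up to the next multiple of round. A standalone sketch:

    #include <stdio.h>

    int main(void)
    {
            unsigned long gapstart = 0x10000000UL, gapsize = 0x20000000UL;
            unsigned long round = 0x100000UL;       /* start at 1MB */

            while ((gapsize >> 4) > round)          /* grow toward gap/16 */
                    round += round;
            printf("round = %#lx\n", round);        /* 0x2000000 (32MB) */
            printf("pci_mem_start = %#lx\n",
                   (gapstart + round) & -round);    /* 0x12000000 */
            return 0;
    }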
27273 +
27274 +#endif
27275 +
27276 +void __init parse_memopt(char *p, char **from) 
27277 +{ 
27278 +       end_user_pfn = memparse(p, from);
27279 +       end_user_pfn >>= PAGE_SHIFT;    
27280 +       xen_override_max_pfn = (unsigned long) end_user_pfn;
27281 +} 
27282 +
27283 +void __init parse_memmapopt(char *p, char **from)
27284 +{
27285 +       unsigned long long start_at, mem_size;
27286 +
27287 +       mem_size = memparse(p, from);
27288 +       p = *from;
27289 +       if (*p == '@') {
27290 +               start_at = memparse(p+1, from);
27291 +               add_memory_region(start_at, mem_size, E820_RAM);
27292 +       } else if (*p == '#') {
27293 +               start_at = memparse(p+1, from);
27294 +               add_memory_region(start_at, mem_size, E820_ACPI);
27295 +       } else if (*p == '$') {
27296 +               start_at = memparse(p+1, from);
27297 +               add_memory_region(start_at, mem_size, E820_RESERVED);
27298 +       } else {
27299 +               end_user_pfn = (mem_size >> PAGE_SHIFT);
27300 +       }
27301 +       p = *from;
27302 +}
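These two parsers correspond to the standard mem= and memmap= kernel parameters; for reference, the accepted forms look like:

    mem=512M         cap usable memory (here this also sets xen_override_max_pfn)
    memmap=64M@1G    declare a RAM region at 1GB
    memmap=64M#1G    declare an ACPI-data region at 1GB
    memmap=64M$1G    declare a reserved region at 1GB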
27303 +
27304 +/*
27305 + * Search for the biggest gap in the low 32 bits of the e820
27306 + * memory space.  We pass this space to PCI, which assigns MMIO resources
27307 + * in it for hotplug or unconfigured devices.
27308 + * Hopefully the BIOS left enough space.
27309 + */
27310 +__init void e820_setup_gap(void)
27311 +{
27312 +#ifndef CONFIG_XEN
27313 +       unsigned long gapstart, gapsize;
27314 +       unsigned long last;
27315 +       int i;
27316 +       int found = 0;
27317 +
27318 +       last = 0x100000000ull;
27319 +       gapstart = 0x10000000;
27320 +       gapsize = 0x400000;
27321 +       i = e820.nr_map;
27322 +       while (--i >= 0) {
27323 +               unsigned long long start = e820.map[i].addr;
27324 +               unsigned long long end = start + e820.map[i].size;
27325 +
27326 +               /*
27327 +                * Since "last" is at most 4GB, we know we'll
27328 +                * fit in 32 bits if this condition is true
27329 +                */
27330 +               if (last > end) {
27331 +                       unsigned long gap = last - end;
27332 +
27333 +                       if (gap > gapsize) {
27334 +                               gapsize = gap;
27335 +                               gapstart = end;
27336 +                               found = 1;
27337 +                       }
27338 +               }
27339 +               if (start < last)
27340 +                       last = start;
27341 +       }
27342 +
27343 +       if (!found) {
27344 +               gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
27345 +               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
27346 +                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
27347 +       }
27348 +
27349 +       /*
27350 +        * Start allocating dynamic PCI memory a bit into the gap,
27351 +        * aligned up to the nearest megabyte.
27352 +        *
27353 +        * Question: should we try to pad it up a bit (do something
27354 +        * like " + (gapsize >> 3)" in there too?). We now have the
27355 +        * technology.
27356 +        */
27357 +       pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
27358 +
27359 +       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
27360 +               pci_mem_start, gapstart, gapsize);
27361 +#endif
27362 +}
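To see the top-down scan at work, here is a self-contained sketch (not part of the patch) running the same loop over a toy e820 map, already sorted as sanitize_e820_map() would leave it:

    #include <stdio.h>

    struct ent { unsigned long long addr, size; };

    int main(void)
    {
            struct ent map[] = {
                    { 0x0ULL,        0xa0000ULL    },  /* 0 - 640k       */
                    { 0x100000ULL,   0x3ff00000ULL },  /* 1M - 1G        */
                    { 0xfec00000ULL, 0x1400000ULL  },  /* APIC/BIOS area */
            };
            unsigned long long last = 0x100000000ULL;
            unsigned long long gapstart = 0x10000000, gapsize = 0x400000;
            int i;

            for (i = 2; i >= 0; i--) {
                    unsigned long long end = map[i].addr + map[i].size;

                    if (last > end && last - end > gapsize) {
                            gapsize = last - end;
                            gapstart = end;
                    }
                    if (map[i].addr < last)
                            last = map[i].addr;
            }
            /* finds the 1G..0xfec00000 hole: gap at 0x40000000, ~3GB */
            printf("gap at %#llx size %#llx\n", gapstart, gapsize);
            return 0;
    }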
27363 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/early_printk-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/early_printk-xen.c
27364 --- ref-linux-2.6.16.9/arch/x86_64/kernel/early_printk-xen.c    1970-01-01 01:00:00.000000000 +0100
27365 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/early_printk-xen.c       2006-04-10 00:05:52.000000000 +0200
27366 @@ -0,0 +1,306 @@
27367 +#include <linux/config.h>
27368 +#include <linux/console.h>
27369 +#include <linux/kernel.h>
27370 +#include <linux/init.h>
27371 +#include <linux/string.h>
27372 +#include <linux/tty.h>
27373 +#include <asm/io.h>
27374 +#include <asm/processor.h>
27375 +#include <asm/fcntl.h>
27376 +
27377 +/* Simple VGA output */
27378 +
27379 +#ifdef __i386__
27380 +#include <asm/setup.h>
27381 +#define VGABASE                (__ISA_IO_base + 0xb8000)
27382 +#else
27383 +#include <asm/bootsetup.h>
27384 +#define VGABASE                ((void __iomem *)0xffffffff800b8000UL)
27385 +#endif
27386 +
27387 +#define MAX_YPOS       max_ypos
27388 +#define MAX_XPOS       max_xpos
27389 +
27390 +static int max_ypos = 25, max_xpos = 80;
27391 +
27392 +#ifndef CONFIG_XEN
27393 +static int current_ypos = 1, current_xpos = 0; 
27394 +
27395 +static void early_vga_write(struct console *con, const char *str, unsigned n)
27396 +{
27397 +       char c;
27398 +       int  i, k, j;
27399 +
27400 +       while ((c = *str++) != '\0' && n-- > 0) {
27401 +               if (current_ypos >= MAX_YPOS) {
27402 +                       /* scroll 1 line up */
27403 +                       for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
27404 +                               for (i = 0; i < MAX_XPOS; i++) {
27405 +                                       writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
27406 +                                              VGABASE + 2*(MAX_XPOS*j + i));
27407 +                               }
27408 +                       }
27409 +                       for (i = 0; i < MAX_XPOS; i++)
27410 +                               writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
27411 +                       current_ypos = MAX_YPOS-1;
27412 +               }
27413 +               if (c == '\n') {
27414 +                       current_xpos = 0;
27415 +                       current_ypos++;
27416 +               } else if (c != '\r')  {
27417 +                       writew(((0x7 << 8) | (unsigned short) c),
27418 +                              VGABASE + 2*(MAX_XPOS*current_ypos +
27419 +                                               current_xpos++));
27420 +                       if (current_xpos >= MAX_XPOS) {
27421 +                               current_xpos = 0;
27422 +                               current_ypos++;
27423 +                       }
27424 +               }
27425 +       }
27426 +}
27427 +
27428 +static struct console early_vga_console = {
27429 +       .name =         "earlyvga",
27430 +       .write =        early_vga_write,
27431 +       .flags =        CON_PRINTBUFFER,
27432 +       .index =        -1,
27433 +};
27434 +
27435 +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ 
27436 +
27437 +static int early_serial_base = 0x3f8;  /* ttyS0 */
27438 +
27439 +#define XMTRDY          0x20
27440 +
27441 +#define DLAB           0x80
27442 +
27443 +#define TXR             0       /*  Transmit register (WRITE) */
27444 +#define RXR             0       /*  Receive register  (READ)  */
27445 +#define IER             1       /*  Interrupt Enable          */
27446 +#define IIR             2       /*  Interrupt ID              */
27447 +#define FCR             2       /*  FIFO control              */
27448 +#define LCR             3       /*  Line control              */
27449 +#define MCR             4       /*  Modem control             */
27450 +#define LSR             5       /*  Line Status               */
27451 +#define MSR             6       /*  Modem Status              */
27452 +#define DLL             0       /*  Divisor Latch Low         */
27453 +#define DLH             1       /*  Divisor latch High        */
27454 +
27455 +static int early_serial_putc(unsigned char ch) 
27456 +{ 
27457 +       unsigned timeout = 0xffff; 
27458 +       while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 
27459 +               cpu_relax();
27460 +       outb(ch, early_serial_base + TXR);
27461 +       return timeout ? 0 : -1;
27462 +} 
27463 +
27464 +static void early_serial_write(struct console *con, const char *s, unsigned n)
27465 +{
27466 +       while (*s && n-- > 0) { 
27467 +               early_serial_putc(*s); 
27468 +               if (*s == '\n') 
27469 +                       early_serial_putc('\r'); 
27470 +               s++; 
27471 +       } 
27472 +} 
27473 +
27474 +#define DEFAULT_BAUD 9600
27475 +
27476 +static __init void early_serial_init(char *s)
27477 +{
27478 +       unsigned char c; 
27479 +       unsigned divisor;
27480 +       unsigned baud = DEFAULT_BAUD;
27481 +       char *e;
27482 +
27483 +       if (*s == ',')
27484 +               ++s;
27485 +
27486 +       if (*s) {
27487 +               unsigned port; 
27488 +               if (!strncmp(s,"0x",2)) {
27489 +                       early_serial_base = simple_strtoul(s, &e, 16);
27490 +               } else {
27491 +                       static int bases[] = { 0x3f8, 0x2f8 };
27492 +
27493 +                       if (!strncmp(s,"ttyS",4))
27494 +                               s += 4;
27495 +                       port = simple_strtoul(s, &e, 10);
27496 +                       if (port > 1 || s == e)
27497 +                               port = 0;
27498 +                       early_serial_base = bases[port];
27499 +               }
27500 +               s += strcspn(s, ",");
27501 +               if (*s == ',')
27502 +                       s++;
27503 +       }
27504 +
27505 +       outb(0x3, early_serial_base + LCR);     /* 8n1 */
27506 +       outb(0, early_serial_base + IER);       /* no interrupt */
27507 +       outb(0, early_serial_base + FCR);       /* no fifo */
27508 +       outb(0x3, early_serial_base + MCR);     /* DTR + RTS */
27509 +
27510 +       if (*s) {
27511 +               baud = simple_strtoul(s, &e, 0); 
27512 +               if (baud == 0 || s == e) 
27513 +                       baud = DEFAULT_BAUD;
27514 +       } 
27515 +       
27516 +       divisor = 115200 / baud; 
27517 +       c = inb(early_serial_base + LCR); 
27518 +       outb(c | DLAB, early_serial_base + LCR); 
27519 +       outb(divisor & 0xff, early_serial_base + DLL); 
27520 +       outb((divisor >> 8) & 0xff, early_serial_base + DLH); 
27521 +       outb(c & ~DLAB, early_serial_base + LCR);
27522 +}
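The divisor programmed at the end of early_serial_init() is the usual 16550 formula, divisor = 115200 / baud (the 1.8432MHz UART clock divided by 16 gives the 115200 base); e.g. 9600 baud yields divisor 12 and 115200 baud yields divisor 1.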
27523 +
27524 +#else /* CONFIG_XEN */
27525 +
27526 +#undef SCREEN_INFO
27527 +#define SCREEN_INFO screen_info
27528 +extern struct screen_info screen_info;
27529 +
27530 +static void
27531 +early_serial_write(struct console *con, const char *s, unsigned count)
27532 +{
27533 +       int n;
27534 +
27535 +       while (count > 0) {
27536 +               n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
27537 +               if (n <= 0)
27538 +                       break;
27539 +               count -= n;
27540 +               s += n;
27541 +       }
27542 +} 
27543 +
27544 +static __init void early_serial_init(char *s)
27545 +{
27546 +}
27547 +
27548 +/*
27549 + * No early VGA console on Xen, as we do not have convenient ISA-space
27550 + * mappings. Someone should fix this for domain 0. For now, use fake serial.
27551 + */
27552 +#define early_vga_console early_serial_console
27553 +
27554 +#endif
27555 +
27556 +static struct console early_serial_console = {
27557 +       .name =         "earlyser",
27558 +       .write =        early_serial_write,
27559 +       .flags =        CON_PRINTBUFFER,
27560 +       .index =        -1,
27561 +};
27562 +
27563 +/* Console interface to a host file on AMD's SimNow! */
27564 +
27565 +static int simnow_fd;
27566 +
27567 +enum {
27568 +       MAGIC1 = 0xBACCD00A,
27569 +       MAGIC2 = 0xCA110000,
27570 +       XOPEN = 5,
27571 +       XWRITE = 4,
27572 +};
27573 +
27574 +static noinline long simnow(long cmd, long a, long b, long c)
27575 +{
27576 +       long ret;
27577 +       asm volatile("cpuid" :
27578 +                    "=a" (ret) :
27579 +                    "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
27580 +       return ret;
27581 +}
27582 +
27583 +void __init simnow_init(char *str)
27584 +{
27585 +       char *fn = "klog";
27586 +       if (*str == '=')
27587 +               fn = ++str;
27588 +       /* error ignored */
27589 +       simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
27590 +}
27591 +
27592 +static void simnow_write(struct console *con, const char *s, unsigned n)
27593 +{
27594 +       simnow(XWRITE, simnow_fd, (unsigned long)s, n);
27595 +}
27596 +
27597 +static struct console simnow_console = {
27598 +       .name =         "simnow",
27599 +       .write =        simnow_write,
27600 +       .flags =        CON_PRINTBUFFER,
27601 +       .index =        -1,
27602 +};
27603 +
27604 +/* Direct interface for emergencies */
27605 +struct console *early_console = &early_vga_console;
27606 +static int early_console_initialized = 0;
27607 +
27608 +void early_printk(const char *fmt, ...)
27609 +{ 
27610 +       char buf[512]; 
27611 +       int n; 
27612 +       va_list ap;
27613 +
27614 +       va_start(ap,fmt); 
27615 +       n = vscnprintf(buf,512,fmt,ap);
27616 +       early_console->write(early_console,buf,n);
27617 +       va_end(ap); 
27618 +} 
27619 +
27620 +static int __initdata keep_early;
27621 +
27622 +int __init setup_early_printk(char *opt) 
27623 +{  
27624 +       char *space;
27625 +       char buf[256]; 
27626 +
27627 +       if (early_console_initialized)
27628 +               return -1;
27629 +
27630 +       strlcpy(buf,opt,sizeof(buf)); 
27631 +       space = strchr(buf, ' '); 
27632 +       if (space)
27633 +               *space = 0; 
27634 +
27635 +       if (strstr(buf,"keep"))
27636 +               keep_early = 1; 
27637 +
27638 +       if (!strncmp(buf, "serial", 6)) { 
27639 +               early_serial_init(buf + 6);
27640 +               early_console = &early_serial_console;
27641 +       } else if (!strncmp(buf, "ttyS", 4)) { 
27642 +               early_serial_init(buf);
27643 +               early_console = &early_serial_console;          
27644 +       } else if (!strncmp(buf, "vga", 3)
27645 +                  && SCREEN_INFO.orig_video_isVGA == 1) {
27646 +               max_xpos = SCREEN_INFO.orig_video_cols;
27647 +               max_ypos = SCREEN_INFO.orig_video_lines;
27648 +               early_console = &early_vga_console; 
27649 +       } else if (!strncmp(buf, "simnow", 6)) {
27650 +               simnow_init(buf + 6);
27651 +               early_console = &simnow_console;
27652 +               keep_early = 1;
27653 +       }
27654 +       early_console_initialized = 1;
27655 +       register_console(early_console);       
27656 +       return 0;
27657 +}
27658 +
27659 +void __init disable_early_printk(void)
27660 +{ 
27661 +       if (!early_console_initialized || !early_console)
27662 +               return;
27663 +       if (!keep_early) {
27664 +               printk("disabling early console\n");
27665 +               unregister_console(early_console);
27666 +               early_console_initialized = 0;
27667 +       } else { 
27668 +               printk("keeping early console\n");
27669 +       }
27670 +} 
27671 +
27672 +__setup("earlyprintk=", setup_early_printk);
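For reference, the option strings this parser accepts follow the usual earlyprintk= forms, e.g.:

    earlyprintk=serial,ttyS0,115200,keep
    earlyprintk=serial,0x2f8,9600
    earlyprintk=vga
    earlyprintk=simnow=klog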
27673 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/entry-xen.S tmp-linux-2.6-xen.patch/arch/x86_64/kernel/entry-xen.S
27674 --- ref-linux-2.6.16.9/arch/x86_64/kernel/entry-xen.S   1970-01-01 01:00:00.000000000 +0100
27675 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/entry-xen.S      2006-04-10 00:05:52.000000000 +0200
27676 @@ -0,0 +1,1149 @@
27677 +/*
27678 + *  linux/arch/x86_64/entry.S
27679 + *
27680 + *  Copyright (C) 1991, 1992  Linus Torvalds
27681 + *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
27682 + *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
27683 + * 
27684 + *  $Id$
27685 + *
27686 + *  Jun Nakajima <jun.nakajima@intel.com>
27687 + *  Asit Mallick <asit.k.mallick@intel.com>
27688 + *      Modified for Xen
27689 + */
27690 +
27691 +/*
27692 + * entry.S contains the system-call and fault low-level handling routines.
27693 + *
27694 + * NOTE: This code handles signal-recognition, which happens every time
27695 + * after an interrupt and after each system call.
27696 + * 
27697 + * Normal syscalls and interrupts don't save a full stack frame; this is
27698 + * only done for syscall tracing, signals, or fork/exec and the like.
27699 + * 
27700 + * A note on terminology:       
27701 + * - top of stack: Architecture defined interrupt frame from SS to RIP 
27702 + * at the top of the kernel process stack.     
27703 + * - partial stack frame: partially saved registers up to R11.
27704 + * - full stack frame: like the partial stack frame, but with all registers saved.
27705 + *     
27706 + * TODO:        
27707 + * - schedule it carefully for the final hardware.
27708 + */
27709 +
27710 +#define ASSEMBLY 1
27711 +#include <linux/config.h>
27712 +#ifdef CONFIG_DEBUG_INFO
27713 +#undef CONFIG_DEBUG_INFO
27714 +#endif
27715 +#include <linux/linkage.h>
27716 +#include <asm/segment.h>
27717 +#include <asm/smp.h>
27718 +#include <asm/cache.h>
27719 +#include <asm/errno.h>
27720 +#include <asm/dwarf2.h>
27721 +#include <asm/calling.h>
27722 +#include <asm/asm-offsets.h>
27723 +#include <asm/msr.h>
27724 +#include <asm/unistd.h>
27725 +#include <asm/thread_info.h>
27726 +#include <asm/hw_irq.h>
27727 +#include <asm/page.h>
27728 +#include <asm/errno.h>
27729 +#include <xen/interface/arch-x86_64.h>
27730 +#include <xen/interface/features.h>
27731 +
27732 +#include "irq_vectors.h"
27733 +
27734 +#include "xen_entry.S"
27735 +       
27736 +       .code64
27737 +
27738 +#ifndef CONFIG_PREEMPT
27739 +#define retint_kernel retint_restore_args
27740 +#endif 
27741 +
27742 +NMI_MASK = 0x80000000
27743 +       
27744 +/*
27745 + * C code is not supposed to know about undefined top of stack. Every time 
27746 + * a C function with a pt_regs argument is called from the SYSCALL-based
27747 + * fast path, FIXUP_TOP_OF_STACK is needed.
27748 + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
27749 + * manipulation.
27750 + */            
27751 +               
27752 +       /* %rsp:at FRAMEEND */ 
27753 +       .macro FIXUP_TOP_OF_STACK tmp
27754 +       movq    $__USER_CS,CS(%rsp)
27755 +       movq    $-1,RCX(%rsp)
27756 +       .endm
27757 +
27758 +       .macro RESTORE_TOP_OF_STACK tmp,offset=0
27759 +       .endm
27760 +
27761 +       .macro FAKE_STACK_FRAME child_rip
27762 +       /* push in order ss, rsp, eflags, cs, rip */
27763 +       xorl %eax, %eax
27764 +       pushq %rax /* ss */
27765 +       CFI_ADJUST_CFA_OFFSET   8
27766 +       /*CFI_REL_OFFSET        ss,0*/
27767 +       pushq %rax /* rsp */
27768 +       CFI_ADJUST_CFA_OFFSET   8
27769 +       CFI_REL_OFFSET  rsp,0
27770 +       pushq $(1<<9) /* eflags - interrupts on */
27771 +       CFI_ADJUST_CFA_OFFSET   8
27772 +       /*CFI_REL_OFFSET        rflags,0*/
27773 +       pushq $__KERNEL_CS /* cs */
27774 +       CFI_ADJUST_CFA_OFFSET   8
27775 +       /*CFI_REL_OFFSET        cs,0*/
27776 +       pushq \child_rip /* rip */
27777 +       CFI_ADJUST_CFA_OFFSET   8
27778 +       CFI_REL_OFFSET  rip,0
27779 +       pushq   %rax /* orig rax */
27780 +       CFI_ADJUST_CFA_OFFSET   8
27781 +       .endm
27782 +
27783 +       .macro UNFAKE_STACK_FRAME
27784 +       addq $8*6, %rsp
27785 +       CFI_ADJUST_CFA_OFFSET   -(6*8)
27786 +       .endm
27787 +
27788 +       .macro  CFI_DEFAULT_STACK start=1
27789 +       .if \start
27790 +       CFI_STARTPROC   simple
27791 +       CFI_DEF_CFA     rsp,SS+8
27792 +       .else
27793 +       CFI_DEF_CFA_OFFSET SS+8
27794 +       .endif
27795 +       CFI_REL_OFFSET  r15,R15
27796 +       CFI_REL_OFFSET  r14,R14
27797 +       CFI_REL_OFFSET  r13,R13
27798 +       CFI_REL_OFFSET  r12,R12
27799 +       CFI_REL_OFFSET  rbp,RBP
27800 +       CFI_REL_OFFSET  rbx,RBX
27801 +       CFI_REL_OFFSET  r11,R11
27802 +       CFI_REL_OFFSET  r10,R10
27803 +       CFI_REL_OFFSET  r9,R9
27804 +       CFI_REL_OFFSET  r8,R8
27805 +       CFI_REL_OFFSET  rax,RAX
27806 +       CFI_REL_OFFSET  rcx,RCX
27807 +       CFI_REL_OFFSET  rdx,RDX
27808 +       CFI_REL_OFFSET  rsi,RSI
27809 +       CFI_REL_OFFSET  rdi,RDI
27810 +       CFI_REL_OFFSET  rip,RIP
27811 +       /*CFI_REL_OFFSET        cs,CS*/
27812 +       /*CFI_REL_OFFSET        rflags,EFLAGS*/
27813 +       CFI_REL_OFFSET  rsp,RSP
27814 +       /*CFI_REL_OFFSET        ss,SS*/
27815 +       .endm
27816 +
27817 +        /*
27818 +         * Must be consistent with the definition in arch-x86_64.h:    
27819 +         *     struct iret_context {
27820 +         *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
27821 +         *     };
27822 +         * #define VGCF_IN_SYSCALL (1<<8) 
27823 +         */
27824 +       .macro HYPERVISOR_IRET flag
27825 +       testb $3,1*8(%rsp)
27826 +       jnz   2f
27827 +       testl $NMI_MASK,2*8(%rsp)
27828 +       jnz   2f
27829 +
27830 +       testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
27831 +       jnz   1f
27832 +
27833 +       /* Direct iret to kernel space. Correct CS and SS. */
27834 +       orb   $3,1*8(%rsp)
27835 +       orb   $3,4*8(%rsp)
27836 +1:     iretq
27837 +
27838 +2:     /* Slow iret via hypervisor. */
27839 +       andl  $~NMI_MASK, 16(%rsp)
27840 +       pushq $\flag
27841 +       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
27842 +       .endm
27843 +
27844 +        .macro SWITCH_TO_KERNEL ssoff,adjust=0
27845 +       jc  1f
27846 +       orb  $1,\ssoff-\adjust+4(%rsp)
27847 +1:
27848 +        .endm
27849 +
27850 +/*
27851 + * A newly forked process directly context switches into this.
27852 + */    
27853 +/* rdi:        prev */ 
27854 +ENTRY(ret_from_fork)
27855 +       CFI_DEFAULT_STACK
27856 +       call schedule_tail
27857 +       GET_THREAD_INFO(%rcx)
27858 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
27859 +       jnz rff_trace
27860 +rff_action:    
27861 +       RESTORE_REST
27862 +       testl $3,CS-ARGOFFSET(%rsp)     # from kernel_thread?
27863 +       je   int_ret_from_sys_call
27864 +       testl $_TIF_IA32,threadinfo_flags(%rcx)
27865 +       jnz  int_ret_from_sys_call
27866 +       RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
27867 +       jmp ret_from_sys_call
27868 +rff_trace:
27869 +       movq %rsp,%rdi
27870 +       call syscall_trace_leave
27871 +       GET_THREAD_INFO(%rcx)   
27872 +       jmp rff_action
27873 +       CFI_ENDPROC
27874 +
27875 +/*
27876 + * System call entry. Up to 6 arguments in registers are supported.
27877 + *
27878 + * SYSCALL does not save anything on the stack and does not change the
27879 + * stack pointer.
27880 + */
27881 +               
27882 +/*
27883 + * Register setup:     
27884 + * rax  system call number
27885 + * rdi  arg0
27886 + * rcx  return address for syscall/sysret, C arg3 
27887 + * rsi  arg1
27888 + * rdx  arg2   
27889 + * r10  arg3   (--> moved to rcx for C)
27890 + * r8   arg4
27891 + * r9   arg5
27892 + * r11  eflags for syscall/sysret, temporary for C
27893 + * r12-r15,rbp,rbx saved by C code, not touched.               
27894 + * 
27895 + * Interrupts are off on entry.
27896 + * Only called from user space.
27897 + *
27898 + * XXX if we had a free scratch register we could save the RSP into the stack frame
27899 + *      and report it properly in ps. Unfortunately we don't have one.
27900 + */                                    
27901 +
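One subtlety worth spelling out: SYSCALL itself overwrites %rcx with the return address, which is why the 4th C argument (normally %rcx in the System V ABI) travels in %r10 and is moved back before the indirect call below. A userspace sketch of the calling side (illustrative only):

    /* invoke a 4-argument syscall directly; arg3 goes in r10, not rcx */
    static long raw_syscall4(long nr, long a0, long a1, long a2, long a3)
    {
            long ret;
            register long r10 __asm__("r10") = a3;

            __asm__ volatile ("syscall"
                              : "=a" (ret)
                              : "0" (nr), "D" (a0), "S" (a1), "d" (a2),
                                "r" (r10)
                              : "rcx", "r11", "memory");
            return ret;
    }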
27902 +ENTRY(system_call)
27903 +       CFI_STARTPROC   simple
27904 +       CFI_DEF_CFA     rsp,0
27905 +       CFI_REGISTER    rip,rcx
27906 +       /*CFI_REGISTER  rflags,r11*/
27907 +       SAVE_ARGS -8,0
27908 +       movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
27909 +        XEN_UNBLOCK_EVENTS(%r11)        
27910 +       GET_THREAD_INFO(%rcx)
27911 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
27912 +       CFI_REMEMBER_STATE
27913 +       jnz tracesys
27914 +       cmpq $__NR_syscall_max,%rax
27915 +       ja badsys
27916 +       movq %r10,%rcx
27917 +       call *sys_call_table(,%rax,8)  # XXX:    rip relative
27918 +       movq %rax,RAX-ARGOFFSET(%rsp)
27919 +/*
27920 + * Syscall return path ending with SYSRET (fast path)
27921 + * Has incomplete stack frame and undefined top of stack. 
27922 + */            
27923 +       .globl ret_from_sys_call
27924 +ret_from_sys_call:
27925 +       movl $_TIF_ALLWORK_MASK,%edi
27926 +       /* edi: flagmask */
27927 +sysret_check:          
27928 +       GET_THREAD_INFO(%rcx)
27929 +        XEN_BLOCK_EVENTS(%rsi)        
27930 +       movl threadinfo_flags(%rcx),%edx
27931 +       andl %edi,%edx
27932 +       CFI_REMEMBER_STATE
27933 +       jnz  sysret_careful 
27934 +        XEN_UNBLOCK_EVENTS(%rsi)                
27935 +       CFI_REGISTER    rip,rcx
27936 +       RESTORE_ARGS 0,8,0
27937 +       /*CFI_REGISTER  rflags,r11*/
27938 +        HYPERVISOR_IRET VGCF_IN_SYSCALL
27939 +
27940 +       /* Handle reschedules */
27941 +       /* edx: work, edi: workmask */  
27942 +sysret_careful:
27943 +       CFI_RESTORE_STATE
27944 +       bt $TIF_NEED_RESCHED,%edx
27945 +       jnc sysret_signal
27946 +        XEN_BLOCK_EVENTS(%rsi)        
27947 +       pushq %rdi
27948 +       CFI_ADJUST_CFA_OFFSET 8
27949 +       call schedule
27950 +       popq  %rdi
27951 +       CFI_ADJUST_CFA_OFFSET -8
27952 +       jmp sysret_check
27953 +
27954 +       /* Handle a signal */ 
27955 +sysret_signal:
27956 +/*     sti */
27957 +        XEN_UNBLOCK_EVENTS(%rsi)        
27958 +       testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
27959 +       jz    1f
27960 +
27961 +       /* Really a signal */
27962 +       /* edx: work flags (arg3) */
27963 +       leaq do_notify_resume(%rip),%rax
27964 +       leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
27965 +       xorl %esi,%esi # oldset -> arg2
27966 +       call ptregscall_common
27967 +1:     movl $_TIF_NEED_RESCHED,%edi
27968 +       jmp sysret_check
27969 +       
27970 +badsys:
27971 +       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
27972 +       jmp ret_from_sys_call
27973 +
27974 +       /* Do syscall tracing */
27975 +tracesys:                       
27976 +       CFI_RESTORE_STATE
27977 +       SAVE_REST
27978 +       movq $-ENOSYS,RAX(%rsp)
27979 +       FIXUP_TOP_OF_STACK %rdi
27980 +       movq %rsp,%rdi
27981 +       call syscall_trace_enter
27982 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
27983 +       RESTORE_REST
27984 +       cmpq $__NR_syscall_max,%rax
27985 +       ja  1f
27986 +       movq %r10,%rcx  /* fixup for C */
27987 +       call *sys_call_table(,%rax,8)
27988 +       movq %rax,RAX-ARGOFFSET(%rsp)
27989 +1:     SAVE_REST
27990 +       movq %rsp,%rdi
27991 +       call syscall_trace_leave
27992 +       RESTORE_TOP_OF_STACK %rbx
27993 +       RESTORE_REST
27994 +       jmp ret_from_sys_call
27995 +       CFI_ENDPROC
27996 +               
27997 +/* 
27998 + * Syscall return path ending with IRET.
27999 + * Has correct top of stack, but partial stack frame.
28000 + */    
28001 +ENTRY(int_ret_from_sys_call)
28002 +       CFI_STARTPROC   simple
28003 +       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
28004 +       /*CFI_REL_OFFSET        ss,SS-ARGOFFSET*/
28005 +       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
28006 +       /*CFI_REL_OFFSET        rflags,EFLAGS-ARGOFFSET*/
28007 +       /*CFI_REL_OFFSET        cs,CS-ARGOFFSET*/
28008 +       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
28009 +       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
28010 +       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
28011 +       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
28012 +       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
28013 +       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
28014 +       CFI_REL_OFFSET  r8,R8-ARGOFFSET
28015 +       CFI_REL_OFFSET  r9,R9-ARGOFFSET
28016 +       CFI_REL_OFFSET  r10,R10-ARGOFFSET
28017 +       CFI_REL_OFFSET  r11,R11-ARGOFFSET
28018 +        XEN_BLOCK_EVENTS(%rsi)
28019 +       testb $3,CS-ARGOFFSET(%rsp)
28020 +        jnz 1f
28021 +        /* Need to set the proper %ss (not NULL) for ring 3 iretq */
28022 +        movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
28023 +        jmp retint_restore_args   # return from ring3 kernel
28024 +1:              
28025 +       movl $_TIF_ALLWORK_MASK,%edi
28026 +       /* edi: mask to check */
28027 +int_with_check:
28028 +       GET_THREAD_INFO(%rcx)
28029 +       movl threadinfo_flags(%rcx),%edx
28030 +       andl %edi,%edx
28031 +       jnz   int_careful
28032 +       andl    $~TS_COMPAT,threadinfo_status(%rcx)
28033 +       jmp   retint_restore_args
28034 +
28035 +       /* Either reschedule or signal or syscall exit tracking needed. */
28036 +       /* First do a reschedule test. */
28037 +       /* edx: work, edi: workmask */
28038 +int_careful:
28039 +       bt $TIF_NEED_RESCHED,%edx
28040 +       jnc  int_very_careful
28041 +/*     sti */
28042 +        XEN_UNBLOCK_EVENTS(%rsi)
28043 +       pushq %rdi
28044 +       CFI_ADJUST_CFA_OFFSET 8
28045 +       call schedule
28046 +       popq %rdi
28047 +       CFI_ADJUST_CFA_OFFSET -8
28048 +       cli
28049 +       jmp int_with_check
28050 +
28051 +       /* handle signals and tracing -- both require a full stack frame */
28052 +int_very_careful:
28053 +/*     sti */
28054 +        XEN_UNBLOCK_EVENTS(%rsi)
28055 +       SAVE_REST
28056 +       /* Check for syscall exit trace */      
28057 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
28058 +       jz int_signal
28059 +       pushq %rdi
28060 +       CFI_ADJUST_CFA_OFFSET 8
28061 +       leaq 8(%rsp),%rdi       # &ptregs -> arg1       
28062 +       call syscall_trace_leave
28063 +       popq %rdi
28064 +       CFI_ADJUST_CFA_OFFSET -8
28065 +       andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
28066 +       cli
28067 +       jmp int_restore_rest
28068 +       
28069 +int_signal:
28070 +       testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
28071 +       jz 1f
28072 +       movq %rsp,%rdi          # &ptregs -> arg1
28073 +       xorl %esi,%esi          # oldset -> arg2
28074 +       call do_notify_resume
28075 +1:     movl $_TIF_NEED_RESCHED,%edi    
28076 +int_restore_rest:
28077 +       RESTORE_REST
28078 +       cli
28079 +       jmp int_with_check
28080 +       CFI_ENDPROC
28081 +               
28082 +/* 
28083 + * Certain special system calls need to save a complete stack frame.
28084 + */                                                            
28085 +       
28086 +       .macro PTREGSCALL label,func,arg
28087 +       .globl \label
28088 +\label:
28089 +       leaq    \func(%rip),%rax
28090 +       leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
28091 +       jmp     ptregscall_common
28092 +       .endm
28093 +
28094 +       CFI_STARTPROC
28095 +
28096 +       PTREGSCALL stub_clone, sys_clone, %r8
28097 +       PTREGSCALL stub_fork, sys_fork, %rdi
28098 +       PTREGSCALL stub_vfork, sys_vfork, %rdi
28099 +       PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
28100 +       PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
28101 +       PTREGSCALL stub_iopl, sys_iopl, %rsi
28102 +
28103 +ENTRY(ptregscall_common)
28104 +       popq %r11
28105 +       CFI_ADJUST_CFA_OFFSET -8
28106 +       CFI_REGISTER rip, r11
28107 +       SAVE_REST
28108 +       movq %r11, %r15
28109 +       CFI_REGISTER rip, r15
28110 +       FIXUP_TOP_OF_STACK %r11
28111 +       call *%rax
28112 +       RESTORE_TOP_OF_STACK %r11
28113 +       movq %r15, %r11
28114 +       CFI_REGISTER rip, r11
28115 +       RESTORE_REST
28116 +       pushq %r11
28117 +       CFI_ADJUST_CFA_OFFSET 8
28118 +       CFI_REL_OFFSET rip, 0
28119 +       ret
28120 +       CFI_ENDPROC
28121 +       
28122 +ENTRY(stub_execve)
28123 +       CFI_STARTPROC
28124 +       popq %r11
28125 +       CFI_ADJUST_CFA_OFFSET -8
28126 +       CFI_REGISTER rip, r11
28127 +       SAVE_REST
28128 +       movq %r11, %r15
28129 +       CFI_REGISTER rip, r15
28130 +       FIXUP_TOP_OF_STACK %r11
28131 +       call sys_execve
28132 +       GET_THREAD_INFO(%rcx)
28133 +       bt $TIF_IA32,threadinfo_flags(%rcx)
28134 +       CFI_REMEMBER_STATE
28135 +       jc exec_32bit
28136 +       RESTORE_TOP_OF_STACK %r11
28137 +       movq %r15, %r11
28138 +       CFI_REGISTER rip, r11
28139 +       RESTORE_REST
28140 +       pushq %r11
28141 +       CFI_ADJUST_CFA_OFFSET 8
28142 +       CFI_REL_OFFSET rip, 0
28143 +       ret
28144 +
28145 +exec_32bit:
28146 +       CFI_RESTORE_STATE
28147 +       movq %rax,RAX(%rsp)
28148 +       RESTORE_REST
28149 +       jmp int_ret_from_sys_call
28150 +       CFI_ENDPROC
28151 +       
28152 +/*
28153 + * sigreturn is special because it needs to restore all registers on return.
28154 + * This cannot be done with SYSRET, so use the IRET return path instead.
28155 + */                
28156 +ENTRY(stub_rt_sigreturn)
28157 +       CFI_STARTPROC
28158 +       addq $8, %rsp
28159 +       CFI_ADJUST_CFA_OFFSET   -8
28160 +       SAVE_REST
28161 +       movq %rsp,%rdi
28162 +       FIXUP_TOP_OF_STACK %r11
28163 +       call sys_rt_sigreturn
28164 +       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
28165 +       RESTORE_REST
28166 +       jmp int_ret_from_sys_call
28167 +       CFI_ENDPROC
28168 +
28169 +/*
28170 + * initial frame state for interrupts and exceptions
28171 + */
28172 +       .macro _frame ref
28173 +       CFI_STARTPROC simple
28174 +       CFI_DEF_CFA rsp,SS+8-\ref
28175 +       /*CFI_REL_OFFSET ss,SS-\ref*/
28176 +       CFI_REL_OFFSET rsp,RSP-\ref
28177 +       /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
28178 +       /*CFI_REL_OFFSET cs,CS-\ref*/
28179 +       CFI_REL_OFFSET rip,RIP-\ref
28180 +       .endm
28181 +
28182 +/* initial frame state for interrupts (and exceptions without error code) */
28183 +#define INTR_FRAME _frame RIP
28184 +/* initial frame state for exceptions with error code (and interrupts with
28185 +   vector already pushed) */
28186 +#define XCPT_FRAME _frame ORIG_RAX
28187 +
28188 +/* 
28189 + * Interrupt exit.
28190 + *
28191 + */ 
28192 +
28193 +retint_check:
28194 +       movl threadinfo_flags(%rcx),%edx
28195 +       andl %edi,%edx
28196 +       CFI_REMEMBER_STATE
28197 +       jnz  retint_careful
28198 +retint_restore_args:
28199 +       movl EFLAGS-REST_SKIP(%rsp), %eax
28200 +       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
28201 +       XEN_GET_VCPU_INFO(%rsi)
28202 +       andb evtchn_upcall_mask(%rsi),%al
28203 +       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
28204 +       jnz restore_all_enable_events   #        != 0 => enable event delivery
28205 +       XEN_PUT_VCPU_INFO(%rsi)
28206 +               
28207 +       RESTORE_ARGS 0,8,0
28208 +       HYPERVISOR_IRET 0
28209 +       
28210 +       /* edi: workmask, edx: work */
28211 +retint_careful:
28212 +       CFI_RESTORE_STATE
28213 +       bt    $TIF_NEED_RESCHED,%edx
28214 +       jnc   retint_signal
28215 +       XEN_UNBLOCK_EVENTS(%rsi)
28216 +/*     sti */        
28217 +       pushq %rdi
28218 +       CFI_ADJUST_CFA_OFFSET   8
28219 +       call  schedule
28220 +       popq %rdi               
28221 +       CFI_ADJUST_CFA_OFFSET   -8
28222 +       XEN_BLOCK_EVENTS(%rsi)          
28223 +       GET_THREAD_INFO(%rcx)
28224 +/*     cli */
28225 +       jmp retint_check
28226 +       
28227 +retint_signal:
28228 +       testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
28229 +       jz    retint_restore_args
28230 +        XEN_UNBLOCK_EVENTS(%rsi)
28231 +       SAVE_REST
28232 +       movq $-1,ORIG_RAX(%rsp)                         
28233 +       xorl %esi,%esi          # oldset
28234 +       movq %rsp,%rdi          # &pt_regs
28235 +       call do_notify_resume
28236 +       RESTORE_REST
28237 +        XEN_BLOCK_EVENTS(%rsi)         
28238 +       movl $_TIF_NEED_RESCHED,%edi
28239 +       GET_THREAD_INFO(%rcx)
28240 +       jmp retint_check
28241 +
28242 +#ifdef CONFIG_PREEMPT
28243 +       /* Returning to kernel space. Check if we need preemption */
28244 +       /* rcx:  threadinfo. interrupts off. */
28245 +       .p2align
28246 +retint_kernel: 
28247 +       cmpl $0,threadinfo_preempt_count(%rcx)
28248 +       jnz  retint_restore_args
28249 +       bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
28250 +       jnc  retint_restore_args
28251 +       bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
28252 +       jnc  retint_restore_args
28253 +       call preempt_schedule_irq
28254 +       jmp retint_kernel       /* check again */
28255 +#endif 
28256 +       CFI_ENDPROC
28257 +       
28258 +/*
28259 + * APIC interrupts.
28260 + */            
28261 +       .macro apicinterrupt num,func
28262 +       INTR_FRAME
28263 +       pushq $~(\num)
28264 +       CFI_ADJUST_CFA_OFFSET 8
28265 +       interrupt \func
28266 +       jmp error_entry
28267 +       CFI_ENDPROC
28268 +       .endm
28269 +
28270 +#ifndef CONFIG_XEN
28271 +ENTRY(thermal_interrupt)
28272 +       apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
28273 +
28274 +ENTRY(threshold_interrupt)
28275 +       apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
28276 +
28277 +#ifdef CONFIG_SMP      
28278 +ENTRY(reschedule_interrupt)
28279 +       apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
28280 +
28281 +       .macro INVALIDATE_ENTRY num
28282 +ENTRY(invalidate_interrupt\num)
28283 +       apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt 
28284 +       .endm
28285 +
28286 +       INVALIDATE_ENTRY 0
28287 +       INVALIDATE_ENTRY 1
28288 +       INVALIDATE_ENTRY 2
28289 +       INVALIDATE_ENTRY 3
28290 +       INVALIDATE_ENTRY 4
28291 +       INVALIDATE_ENTRY 5
28292 +       INVALIDATE_ENTRY 6
28293 +       INVALIDATE_ENTRY 7
28294 +
28295 +ENTRY(call_function_interrupt)
28296 +       apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
28297 +#endif
28298 +
28299 +#ifdef CONFIG_X86_LOCAL_APIC   
28300 +ENTRY(apic_timer_interrupt)
28301 +       apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
28302 +
28303 +ENTRY(error_interrupt)
28304 +       apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
28305 +
28306 +ENTRY(spurious_interrupt)
28307 +       apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
28308 +#endif
28309 +#endif /* !CONFIG_XEN */
28310 +                               
28311 +/*
28312 + * Exception entry points.
28313 + */            
28314 +       .macro zeroentry sym
28315 +       INTR_FRAME
28316 +        movq (%rsp),%rcx
28317 +        movq 8(%rsp),%r11
28318 +        addq $0x10,%rsp /* skip rcx and r11 */
28319 +       pushq $0        /* push error code/oldrax */ 
28320 +       CFI_ADJUST_CFA_OFFSET 8
28321 +       pushq %rax      /* push real oldrax to the rdi slot */ 
28322 +       CFI_ADJUST_CFA_OFFSET 8
28323 +       leaq  \sym(%rip),%rax
28324 +       jmp error_entry
28325 +       CFI_ENDPROC
28326 +       .endm   
28327 +
28328 +       .macro errorentry sym
28329 +       XCPT_FRAME
28330 +        movq (%rsp),%rcx
28331 +        movq 8(%rsp),%r11
28332 +        addq $0x10,%rsp /* rsp points to the error code */
28333 +       pushq %rax
28334 +       CFI_ADJUST_CFA_OFFSET 8
28335 +       leaq  \sym(%rip),%rax
28336 +       jmp error_entry
28337 +       CFI_ENDPROC
28338 +       .endm
28339 +
28340 +#if 0 /* not XEN */
28341 +       /* error code is on the stack already */
28342 +       /* handle NMI like exceptions that can happen everywhere */
28343 +       .macro paranoidentry sym, ist=0
28344 +        movq (%rsp),%rcx
28345 +        movq 8(%rsp),%r11
28346 +        addq $0x10,%rsp /* skip rcx and r11 */        
28347 +       SAVE_ALL
28348 +       cld
28349 +#if 0 /* not XEN */
28350 +       movl $1,%ebx
28351 +       movl  $MSR_GS_BASE,%ecx
28352 +       rdmsr
28353 +       testl %edx,%edx
28354 +       js    1f
28355 +       swapgs
28356 +       xorl  %ebx,%ebx
28357 +1:
28358 +#endif
28359 +       .if \ist
28360 +       movq    %gs:pda_data_offset, %rbp
28361 +       .endif
28362 +       movq %rsp,%rdi
28363 +       movq ORIG_RAX(%rsp),%rsi
28364 +       movq $-1,ORIG_RAX(%rsp)
28365 +       .if \ist
28366 +       subq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
28367 +       .endif
28368 +       call \sym
28369 +       .if \ist
28370 +       addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
28371 +       .endif
28372 +/*     cli */
28373 +       XEN_BLOCK_EVENTS(%rsi)          
28374 +       .endm
28375 +#endif
28376 +       
28377 +/*
28378 + * Exception entry point. This expects an error code/orig_rax on the stack
28379 + * and the exception handler in %rax.  
28380 + */                                            
28381 +ENTRY(error_entry)
28382 +       _frame RDI
28383 +       /* rdi slot contains rax, oldrax contains error code */
28384 +       cld     
28385 +       subq  $14*8,%rsp
28386 +       CFI_ADJUST_CFA_OFFSET   (14*8)
28387 +       movq %rsi,13*8(%rsp)
28388 +       CFI_REL_OFFSET  rsi,RSI
28389 +       movq 14*8(%rsp),%rsi    /* load rax from rdi slot */
28390 +       movq %rdx,12*8(%rsp)
28391 +       CFI_REL_OFFSET  rdx,RDX
28392 +       movq %rcx,11*8(%rsp)
28393 +       CFI_REL_OFFSET  rcx,RCX
28394 +       movq %rsi,10*8(%rsp)    /* store rax */ 
28395 +       CFI_REL_OFFSET  rax,RAX
28396 +       movq %r8, 9*8(%rsp)
28397 +       CFI_REL_OFFSET  r8,R8
28398 +       movq %r9, 8*8(%rsp)
28399 +       CFI_REL_OFFSET  r9,R9
28400 +       movq %r10,7*8(%rsp)
28401 +       CFI_REL_OFFSET  r10,R10
28402 +       movq %r11,6*8(%rsp)
28403 +       CFI_REL_OFFSET  r11,R11
28404 +       movq %rbx,5*8(%rsp) 
28405 +       CFI_REL_OFFSET  rbx,RBX
28406 +       movq %rbp,4*8(%rsp) 
28407 +       CFI_REL_OFFSET  rbp,RBP
28408 +       movq %r12,3*8(%rsp) 
28409 +       CFI_REL_OFFSET  r12,R12
28410 +       movq %r13,2*8(%rsp) 
28411 +       CFI_REL_OFFSET  r13,R13
28412 +       movq %r14,1*8(%rsp) 
28413 +       CFI_REL_OFFSET  r14,R14
28414 +       movq %r15,(%rsp) 
28415 +       CFI_REL_OFFSET  r15,R15
28416 +#if 0        
28417 +       cmpl $__KERNEL_CS,CS(%rsp)
28418 +       je  error_kernelspace
28419 +#endif        
28420 +error_call_handler:
28421 +       movq %rdi, RDI(%rsp)            
28422 +       movq %rsp,%rdi
28423 +       movq ORIG_RAX(%rsp),%rsi        # get error code 
28424 +       movq $-1,ORIG_RAX(%rsp)
28425 +       call *%rax
28426 +error_exit:            
28427 +       RESTORE_REST
28428 +/*     cli */
28429 +       XEN_BLOCK_EVENTS(%rsi)          
28430 +       GET_THREAD_INFO(%rcx)   
28431 +       testb $3,CS-ARGOFFSET(%rsp)
28432 +       jz retint_kernel
28433 +       movl  threadinfo_flags(%rcx),%edx
28434 +       movl  $_TIF_WORK_MASK,%edi      
28435 +       andl  %edi,%edx
28436 +       jnz   retint_careful
28437 +       jmp   retint_restore_args
28438 +
28439 +error_kernelspace:
28440 +         /*
28441 +         * We need to rewrite the logic here because we don't do iretq
28442 +         * to return to user mode. It's still possible that we get a trap/fault
28443 +         * in the kernel (when accessing buffers pointed to by system calls, 
28444 +         * for example).
28445 +         *
28446 +         */           
28447 +#if 0
28448 +       incl %ebx
28449 +       /* There are two places in the kernel that can potentially fault with
28450 +          usergs. Handle them here. The exception handlers after
28451 +          iret run with kernel gs again, so don't set the user space flag.
28452 +          B stepping K8s sometimes report a truncated RIP for IRET
28453 +          exceptions returning to compat mode. Check for these here too. */
28454 +       leaq iret_label(%rip),%rbp
28455 +       cmpq %rbp,RIP(%rsp) 
28456 +       je   error_swapgs
28457 +       movl %ebp,%ebp  /* zero extend */
28458 +       cmpq %rbp,RIP(%rsp) 
28459 +       je   error_swapgs
28460 +       cmpq $gs_change,RIP(%rsp)
28461 +        je   error_swapgs
28462 +       jmp  error_sti
28463 +#endif        
28464 +       
28465 +ENTRY(hypervisor_callback)
28466 +       zeroentry do_hypervisor_callback
28467 +        
28468 +/*
28469 + * Copied from arch/xen/i386/kernel/entry.S
28470 + */               
28471 +# A note on the "critical region" in our callback handler.
28472 +# We want to avoid stacking callback handlers due to events occurring
28473 +# during handling of the last event. To do this, we keep events disabled
28474 +# until we've done all processing. HOWEVER, we must enable events before
28475 +# popping the stack frame (can't be done atomically) and so it would still
28476 +# be possible to get enough handler activations to overflow the stack.
28477 +# Although unlikely, bugs of that kind are hard to track down, so we'd
28478 +# like to avoid the possibility.
28479 +# So, on entry to the handler we detect whether we interrupted an
28480 +# existing activation in its critical region -- if so, we pop the current
28481 +# activation and restart the handler using the previous one.
28482 +ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
28483 +# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
28484 +# see the correct pointer to the pt_regs
28485 +       movq %rdi, %rsp            # we don't return, adjust the stack frame
28486 +11:    movq %gs:pda_irqstackptr,%rax
28487 +       incl %gs:pda_irqcount
28488 +       cmovzq %rax,%rsp
28489 +       pushq %rdi
28490 +       call evtchn_do_upcall
28491 +       popq %rsp
28492 +       decl %gs:pda_irqcount
28493 +       jmp  error_exit
28494 +
28495 +#ifdef CONFIG_X86_LOCAL_APIC
28496 +KPROBE_ENTRY(nmi)
28497 +       zeroentry do_nmi_callback
28498 +ENTRY(do_nmi_callback)
28499 +        addq $8, %rsp
28500 +        call do_nmi
28501 +        orl  $NMI_MASK,EFLAGS(%rsp)
28502 +        RESTORE_REST
28503 +        XEN_BLOCK_EVENTS(%rsi)
28504 +        GET_THREAD_INFO(%rcx)
28505 +        jmp  retint_restore_args
28506 +       .previous .text
28507 +#endif
28508 +
28509 +        ALIGN
28510 +restore_all_enable_events:  
28511 +       XEN_UNBLOCK_EVENTS(%rsi)        # %rsi is already set up...
28512 +
28513 +scrit: /**** START OF CRITICAL REGION ****/
28514 +       XEN_TEST_PENDING(%rsi)
28515 +       jnz  14f                        # process more events if necessary...
28516 +       XEN_PUT_VCPU_INFO(%rsi)
28517 +        RESTORE_ARGS 0,8,0
28518 +        HYPERVISOR_IRET 0
28519 +        
28520 +14:    XEN_LOCKED_BLOCK_EVENTS(%rsi)
28521 +       XEN_PUT_VCPU_INFO(%rsi)
28522 +       SAVE_REST
28523 +        movq %rsp,%rdi                  # set the argument again
28524 +       jmp  11b
28525 +ecrit:  /**** END OF CRITICAL REGION ****/
28526 +# At this point, unlike on x86-32, we don't do the fixup; keeping the code
28527 +# simple matters more, and the stack frame is more complex on x86-64.
28528 +# When the kernel is interrupted in the critical section, it will simply
28529 +# do an IRET, and everything will be restored at that point,
28530 +# i.e. it resumes from the interrupted instruction with the same context.
28531 +
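A C-level sketch of the discipline described above, using the vcpu_info event-mask fields (the helper is hypothetical). Note the window between the final pending check and the unmask: that is exactly the critical region the assembly has to treat specially.

    struct vcpu_info_sketch {
            unsigned char evtchn_upcall_pending;
            unsigned char evtchn_upcall_mask;
    };

    void handle_pending_events(void);           /* hypothetical helper */

    void upcall_loop(struct vcpu_info_sketch *v)
    {
            v->evtchn_upcall_mask = 1;          /* events blocked on entry */
            do {
                    v->evtchn_upcall_pending = 0;
                    handle_pending_events();
            } while (v->evtchn_upcall_pending); /* drain before unmasking */
            v->evtchn_upcall_mask = 0;          /* the racy instant: scrit/ecrit */
    }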
28532 +# Hypervisor uses this for application faults while it executes.
28533 +# We get here for two reasons:
28534 +#  1. Fault while reloading DS, ES, FS or GS
28535 +#  2. Fault while executing IRET
28536 +# Category 1 we do not need to fix up as Xen has already reloaded all segment
28537 +# registers that could be reloaded and zeroed the others.
28538 +# Category 2 we fix up by killing the current process. We cannot use the
28539 +# normal Linux return path in this case because if we use the IRET hypercall
28540 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
28541 +# We distinguish between categories by comparing each saved segment register
28542 +# with its current contents: any discrepancy means we are in category 1.
28543 +ENTRY(failsafe_callback)
28544 +       movw %ds,%cx
28545 +       cmpw %cx,0x10(%rsp)
28546 +       jne 1f
28547 +       movw %es,%cx
28548 +       cmpw %cx,0x18(%rsp)
28549 +       jne 1f
28550 +       movw %fs,%cx
28551 +       cmpw %cx,0x20(%rsp)
28552 +       jne 1f
28553 +       movw %gs,%cx
28554 +       cmpw %cx,0x28(%rsp)
28555 +       jne 1f
28556 +       /* All segments match their saved values => Category 2 (Bad IRET). */
28557 +       movq (%rsp),%rcx
28558 +       movq 8(%rsp),%r11
28559 +       addq $0x30,%rsp
28560 +       movq $-9999,%rdi        /* better code? */
28561 +       jmp do_exit                     
28562 +1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
28563 +       movq (%rsp),%rcx
28564 +       movq 8(%rsp),%r11
28565 +       addq $0x30,%rsp
28566 +       pushq $0
28567 +       SAVE_ALL
28568 +       jmp error_exit
28569 +#if 0        
28570 +        .section __ex_table,"a"
28571 +        .align 8
28572 +        .quad gs_change,bad_gs
28573 +        .previous
28574 +        .section .fixup,"ax"
28575 +       /* running with kernelgs */
28576 +bad_gs: 
28577 +/*     swapgs          */      /* switch back to user gs */
28578 +       xorl %eax,%eax
28579 +        movl %eax,%gs
28580 +        jmp  2b
28581 +        .previous       
28582 +#endif
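The category test can be spelled out in C for clarity (a sketch of the %cx comparisons above; parameters stand in for the selectors saved in the failsafe frame and the live segment registers):

    /* returns 1 for category 1 (bad segment), 2 for category 2 (bad IRET) */
    static int classify_failsafe(unsigned short saved_ds, unsigned short ds,
                                 unsigned short saved_es, unsigned short es,
                                 unsigned short saved_fs, unsigned short fs,
                                 unsigned short saved_gs, unsigned short gs)
    {
            if (saved_ds != ds || saved_es != es ||
                saved_fs != fs || saved_gs != gs)
                    return 1;   /* Xen zeroed a faulting selector */
            return 2;           /* all match: the IRET itself faulted */
    }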
28583 +       
28584 +/*
28585 + * Create a kernel thread.
28586 + *
28587 + * C extern interface:
28588 + *     extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
28589 + *
28590 + * asm input arguments:
28591 + *     rdi: fn, rsi: arg, rdx: flags
28592 + */
28593 +ENTRY(kernel_thread)
28594 +       CFI_STARTPROC
28595 +       FAKE_STACK_FRAME $child_rip
28596 +       SAVE_ALL
28597 +
28598 +       # rdi: flags, rsi: usp, rdx: will be &pt_regs
28599 +       movq %rdx,%rdi
28600 +       orq  kernel_thread_flags(%rip),%rdi
28601 +       movq $-1, %rsi
28602 +       movq %rsp, %rdx
28603 +
28604 +       xorl %r8d,%r8d
28605 +       xorl %r9d,%r9d
28606 +       
28607 +       # clone now
28608 +       call do_fork
28609 +       movq %rax,RAX(%rsp)
28610 +       xorl %edi,%edi
28611 +
28612 +       /*
28613 +        * It isn't worth checking for reschedule here,
28614 +        * so internally to the x86_64 port you can rely on kernel_thread()
28615 +        * not to reschedule the child before returning; this avoids the need
28616 +        * for hacks, for example to fork off the per-CPU idle tasks.
28617 +         * [Hopefully no generic code relies on the reschedule -AK]    
28618 +        */
28619 +       RESTORE_ALL
28620 +       UNFAKE_STACK_FRAME
28621 +       ret
28622 +       CFI_ENDPROC
28623 +
28624 +       
28625 +child_rip:
28626 +       /*
28627 +        * Here we are in the child and the registers are set as they were
28628 +        * at kernel_thread() invocation in the parent.
28629 +        */
28630 +       movq %rdi, %rax
28631 +       movq %rsi, %rdi
28632 +       call *%rax
28633 +       # exit
28634 +       xorl %edi, %edi
28635 +       call do_exit
28636 +
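A sketch of a typical 2.6-era caller of the interface above (the flags are a common combination, not mandated by this code; the header is the assumed declaration site):

    #include <linux/sched.h>

    static int worker(void *arg)
    {
            /* runs in the child; returning falls through to do_exit(0)
               in child_rip above */
            return 0;
    }

    static void spawn_worker(void)
    {
            kernel_thread(worker, NULL, CLONE_FS | CLONE_FILES | SIGCHLD);
    }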
28637 +/*
28638 + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
28639 + *
28640 + * C extern interface:
28641 + *      extern long execve(char *name, char **argv, char **envp)
28642 + *
28643 + * asm input arguments:
28644 + *     rdi: name, rsi: argv, rdx: envp
28645 + *
28646 + * We want to fall back into:
28647 + *     extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
28648 + *
28649 + * do_sys_execve asm fallback arguments:
28650 + *     rdi: name, rsi: argv, rdx: envp, fake frame on the stack
28651 + */
28652 +ENTRY(execve)
28653 +       CFI_STARTPROC
28654 +       FAKE_STACK_FRAME $0
28655 +       SAVE_ALL        
28656 +       call sys_execve
28657 +       movq %rax, RAX(%rsp)    
28658 +       RESTORE_REST
28659 +       testq %rax,%rax
28660 +       jne 1f
28661 +        jmp int_ret_from_sys_call
28662 +1:      RESTORE_ARGS
28663 +       UNFAKE_STACK_FRAME
28664 +       ret
28665 +       CFI_ENDPROC
28666 +
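For context, a sketch of the in-kernel call this stub serves, modeled on how 2.6-era kernels launched init (argv/envp values illustrative; the prototype is the one quoted in the comment above):

    extern long execve(char *name, char **argv, char **envp);

    static char *argv_init[] = { "/sbin/init", NULL };
    static char *envp_init[] = { "HOME=/", "TERM=linux", NULL };

    static void try_init(void)
    {
            execve("/sbin/init", argv_init, envp_init);
            /* reached only on failure: a successful exec never returns
               here, it IRETs into the freshly built user context */
    }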
28667 +KPROBE_ENTRY(page_fault)
28668 +       errorentry do_page_fault
28669 +       .previous .text
28670 +
28671 +ENTRY(coprocessor_error)
28672 +       zeroentry do_coprocessor_error
28673 +
28674 +ENTRY(simd_coprocessor_error)
28675 +       zeroentry do_simd_coprocessor_error     
28676 +
28677 +ENTRY(device_not_available)
28678 +       zeroentry math_state_restore
28679 +
28680 +       /* runs on exception stack */
28681 +KPROBE_ENTRY(debug)
28682 +       INTR_FRAME
28683 +/*     pushq $0
28684 +       CFI_ADJUST_CFA_OFFSET 8 */
28685 +       zeroentry do_debug
28686 +/*     jmp paranoid_exit */
28687 +       CFI_ENDPROC
28688 +       .previous .text
28689 +
28690 +#if 0
28691 +       /* runs on exception stack */   
28692 +KPROBE_ENTRY(nmi)
28693 +       INTR_FRAME
28694 +       pushq $-1
28695 +       CFI_ADJUST_CFA_OFFSET 8
28696 +       paranoidentry do_nmi
28697 +       /*
28698 +        * "Paranoid" exit path from exception stack.
28699 +        * Paranoid because this is used by NMIs and cannot take
28700 +        * any kernel state for granted.
28701 +        * We don't do kernel preemption checks here, because only
28702 +        * NMI should be common and it does not enable IRQs and
28703 +        * cannot get reschedule ticks.
28704 +        */
28705 +       /* ebx: no swapgs flag */
28706 +paranoid_exit:
28707 +       testl %ebx,%ebx                         /* swapgs needed? */
28708 +       jnz paranoid_restore
28709 +       testl $3,CS(%rsp)
28710 +       jnz   paranoid_userspace
28711 +paranoid_swapgs:       
28712 +       swapgs
28713 +paranoid_restore:      
28714 +       RESTORE_ALL 8
28715 +       iretq
28716 +paranoid_userspace:    
28717 +       GET_THREAD_INFO(%rcx)
28718 +       movl threadinfo_flags(%rcx),%ebx
28719 +       andl $_TIF_WORK_MASK,%ebx
28720 +       jz paranoid_swapgs
28721 +       movq %rsp,%rdi                  /* &pt_regs */
28722 +       call sync_regs
28723 +       movq %rax,%rsp                  /* switch stack for scheduling */
28724 +       testl $_TIF_NEED_RESCHED,%ebx
28725 +       jnz paranoid_schedule
28726 +       movl %ebx,%edx                  /* arg3: thread flags */
28727 +       sti
28728 +       xorl %esi,%esi                  /* arg2: oldset */
28729 +       movq %rsp,%rdi                  /* arg1: &pt_regs */
28730 +       call do_notify_resume
28731 +       cli
28732 +       jmp paranoid_userspace
28733 +paranoid_schedule:
28734 +       sti
28735 +       call schedule
28736 +       cli
28737 +       jmp paranoid_userspace
28738 +       CFI_ENDPROC
28739 +       .previous .text
28740 +#endif        
28741 +
28742 +KPROBE_ENTRY(int3)
28743 +       INTR_FRAME
28744 +/*     pushq $0
28745 +       CFI_ADJUST_CFA_OFFSET 8 */
28746 +       zeroentry do_int3
28747 +/*     jmp paranoid_exit */
28748 +       CFI_ENDPROC
28749 +       .previous .text
28750 +
28751 +ENTRY(overflow)
28752 +       zeroentry do_overflow
28753 +
28754 +ENTRY(bounds)
28755 +       zeroentry do_bounds
28756 +
28757 +ENTRY(invalid_op)
28758 +       zeroentry do_invalid_op 
28759 +
28760 +ENTRY(coprocessor_segment_overrun)
28761 +       zeroentry do_coprocessor_segment_overrun
28762 +
28763 +ENTRY(reserved)
28764 +       zeroentry do_reserved
28765 +
28766 +#if 0
28767 +       /* runs on exception stack */
28768 +ENTRY(double_fault)
28769 +       XCPT_FRAME
28770 +       paranoidentry do_double_fault
28771 +       jmp paranoid_exit
28772 +       CFI_ENDPROC
28773 +#endif
28774 +
28775 +ENTRY(invalid_TSS)
28776 +       errorentry do_invalid_TSS
28777 +
28778 +ENTRY(segment_not_present)
28779 +       errorentry do_segment_not_present
28780 +
28781 +       /* runs on exception stack */
28782 +ENTRY(stack_segment)
28783 +       XCPT_FRAME
28784 +       errorentry do_stack_segment
28785 +       CFI_ENDPROC
28786 +
28787 +KPROBE_ENTRY(general_protection)
28788 +       errorentry do_general_protection
28789 +       .previous .text
28790 +
28791 +ENTRY(alignment_check)
28792 +       errorentry do_alignment_check
28793 +
28794 +ENTRY(divide_error)
28795 +       zeroentry do_divide_error
28796 +
28797 +ENTRY(spurious_interrupt_bug)
28798 +       zeroentry do_spurious_interrupt_bug
28799 +
28800 +#ifdef CONFIG_X86_MCE
28801 +       /* runs on exception stack */
28802 +ENTRY(machine_check)
28803 +       INTR_FRAME
28804 +       pushq $0
28805 +       CFI_ADJUST_CFA_OFFSET 8 
28806 +       paranoidentry do_machine_check
28807 +       jmp paranoid_exit
28808 +       CFI_ENDPROC
28809 +#endif
28810 +
28811 +ENTRY(call_softirq)
28812 +       CFI_STARTPROC
28813 +       movq %gs:pda_irqstackptr,%rax
28814 +       movq %rsp,%rdx
28815 +       CFI_DEF_CFA_REGISTER    rdx
28816 +       incl %gs:pda_irqcount
28817 +       cmove %rax,%rsp
28818 +       pushq %rdx
28819 +       /*todo CFI_DEF_CFA_EXPRESSION ...*/
28820 +       call __do_softirq
28821 +       popq %rsp
28822 +       CFI_DEF_CFA_REGISTER    rsp
28823 +       decl %gs:pda_irqcount
28824 +       ret
28825 +       CFI_ENDPROC
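
call_softirq above runs softirqs on the per-CPU IRQ stack: pda_irqcount starts
at -1, so the incl sets ZF only for the outermost entry, and only then does the
cmove switch %rsp to pda_irqstackptr; nested calls stay on the stack they
arrived on. A standalone toy model of just that nesting rule (all names local
to the sketch):

#include <stdio.h>

static int irqcount = -1;	/* mirrors pda_irqcount's initial value */

static void irq_stack_enter(void)
{
	if (++irqcount == 0)	/* ZF after incl: outermost entry only */
		printf("switch %%rsp to the IRQ stack\n");
	else
		printf("nested: stay on the current stack\n");
}

static void irq_stack_leave(void)
{
	irqcount--;
}

int main(void)
{
	irq_stack_enter();	/* switches */
	irq_stack_enter();	/* nested, no switch */
	irq_stack_leave();
	irq_stack_leave();
	return 0;
}
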
28826 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/genapic_xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/genapic_xen.c
28827 --- ref-linux-2.6.16.9/arch/x86_64/kernel/genapic_xen.c 1970-01-01 01:00:00.000000000 +0100
28828 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/genapic_xen.c    2006-04-10 00:05:52.000000000 +0200
28829 @@ -0,0 +1,162 @@
28830 +/*
28831 + * Copyright 2004 James Cleverdon, IBM.
28832 + * Subject to the GNU Public License, v.2
28833 + *
28834 + * Xen APIC subarch code.  Maximum 8 CPUs, logical delivery.
28835 + *
28836 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
28837 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
28838 + * James Cleverdon.
28839 + *
28840 + * Hacked to pieces for Xen by Chris Wright.
28841 + */
28842 +#include <linux/config.h>
28843 +#include <linux/threads.h>
28844 +#include <linux/cpumask.h>
28845 +#include <linux/string.h>
28846 +#include <linux/kernel.h>
28847 +#include <linux/ctype.h>
28848 +#include <linux/init.h>
28849 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
28850 +#include <asm/smp.h>
28851 +#include <asm/ipi.h>
28852 +#else
28853 +#include <asm/apic.h>
28854 +#include <asm/apicdef.h>
28855 +#include <asm/genapic.h>
28856 +#endif
28857 +#include <xen/evtchn.h>
28858 +
28859 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
28860 +
28861 +static inline void __send_IPI_one(unsigned int cpu, int vector)
28862 +{
28863 +       int irq = per_cpu(ipi_to_irq, cpu)[vector];
28864 +       BUG_ON(irq < 0);
28865 +       notify_remote_via_irq(irq);
28866 +}
28867 +
28868 +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
28869 +{
28870 +       int cpu;
28871 +
28872 +       switch (shortcut) {
28873 +       case APIC_DEST_SELF:
28874 +               __send_IPI_one(smp_processor_id(), vector);
28875 +               break;
28876 +       case APIC_DEST_ALLBUT:
28877 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
28878 +                       if (cpu == smp_processor_id())
28879 +                               continue;
28880 +                       if (cpu_isset(cpu, cpu_online_map)) {
28881 +                               __send_IPI_one(cpu, vector);
28882 +                       }
28883 +               }
28884 +               break;
28885 +       case APIC_DEST_ALLINC:
28886 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
28887 +                       if (cpu_isset(cpu, cpu_online_map)) {
28888 +                               __send_IPI_one(cpu, vector);
28889 +                       }
28890 +               }
28891 +               break;
28892 +       default:
28893 +               printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
28894 +                      vector);
28895 +               break;
28896 +       }
28897 +}
28898 +
28899 +static cpumask_t xen_target_cpus(void)
28900 +{
28901 +       return cpu_online_map;
28902 +}
28903 +
28904 +/*
28905 + * Set up the logical destination ID.
28906 + * Do nothing, not called now.
28907 + */
28908 +static void xen_init_apic_ldr(void)
28909 +{
28910 +       Dprintk("%s\n", __FUNCTION__);
28911 +       return;
28912 +}
28913 +
28914 +static void xen_send_IPI_allbutself(int vector)
28915 +{
28916 +       /*
28917 +        * If there are no other CPUs in the system, a broadcast would
28918 +        * trigger an APIC send error, so we must avoid sending IPIs
28919 +        * in that case.
28920 +        */
28921 +       Dprintk("%s\n", __FUNCTION__);
28922 +       if (num_online_cpus() > 1)
28923 +               xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
28924 +}
28925 +
28926 +static void xen_send_IPI_all(int vector)
28927 +{
28928 +       Dprintk("%s\n", __FUNCTION__);
28929 +       xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
28930 +}
28931 +
28932 +static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
28933 +{
28934 +       unsigned long mask = cpus_addr(cpumask)[0];
28935 +       unsigned int cpu;
28936 +       unsigned long flags;
28937 +
28938 +       Dprintk("%s\n", __FUNCTION__);
28939 +       local_irq_save(flags);
28940 +       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
28941 +
28942 +       for (cpu = 0; cpu < NR_CPUS; ++cpu) {
28943 +               if (cpu_isset(cpu, cpumask)) {
28944 +                       __send_IPI_one(cpu, vector);
28945 +               }
28946 +       }
28947 +       local_irq_restore(flags);
28948 +}
28949 +
28950 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
28951 +static int xen_apic_id_registered(void)
28952 +{
28953 +       /* better be set */
28954 +       Dprintk("%s\n", __FUNCTION__);
28955 +       return physid_isset(smp_processor_id(), phys_cpu_present_map);
28956 +}
28957 +#endif
28958 +
28959 +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
28960 +{
28961 +       Dprintk("%s\n", __FUNCTION__);
28962 +       return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
28963 +}
28964 +
28965 +static unsigned int phys_pkg_id(int index_msb)
28966 +{
28967 +       u32 ebx;
28968 +
28969 +       Dprintk("%s\n", __FUNCTION__);
28970 +       ebx = cpuid_ebx(1);
28971 +       return ((ebx >> 24) & 0xFF) >> index_msb;
28972 +}
28973 +
28974 +struct genapic apic_xen =  {
28975 +       .name = "xen",
28976 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
28977 +       .int_delivery_mode = dest_LowestPrio,
28978 +#endif
28979 +       .int_dest_mode = (APIC_DEST_LOGICAL != 0),
28980 +       .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
28981 +       .target_cpus = xen_target_cpus,
28982 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
28983 +       .apic_id_registered = xen_apic_id_registered,
28984 +#endif
28985 +       .init_apic_ldr = xen_init_apic_ldr,
28986 +       .send_IPI_all = xen_send_IPI_all,
28987 +       .send_IPI_allbutself = xen_send_IPI_allbutself,
28988 +       .send_IPI_mask = xen_send_IPI_mask,
28989 +       .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
28990 +       .phys_pkg_id = phys_pkg_id,
28991 +};
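
The net effect of the file above is that an "IPI" under Xen is just a
notification on a pre-bound event channel: __send_IPI_one() looks the
(cpu, vector) pair up in the per-CPU ipi_to_irq table and pokes the resulting
IRQ. A minimal caller sketch, assuming a vector index such as
RESCHEDULE_VECTOR and that the binding side has already filled the table:

static void send_reschedule_sketch(unsigned int cpu)
{
	/* the table is populated when the IPI event channel is bound */
	int irq = per_cpu(ipi_to_irq, cpu)[RESCHEDULE_VECTOR];

	BUG_ON(irq < 0);		/* must already be bound */
	notify_remote_via_irq(irq);	/* Xen raises it on the target CPU */
}
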
28992 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/genapic-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/genapic-xen.c
28993 --- ref-linux-2.6.16.9/arch/x86_64/kernel/genapic-xen.c 1970-01-01 01:00:00.000000000 +0100
28994 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/genapic-xen.c    2006-04-10 00:05:52.000000000 +0200
28995 @@ -0,0 +1,144 @@
28996 +/*
28997 + * Copyright 2004 James Cleverdon, IBM.
28998 + * Subject to the GNU Public License, v.2
28999 + *
29000 + * Generic APIC sub-arch probe layer.
29001 + *
29002 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
29003 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
29004 + * James Cleverdon.
29005 + */
29006 +#include <linux/config.h>
29007 +#include <linux/threads.h>
29008 +#include <linux/cpumask.h>
29009 +#include <linux/string.h>
29010 +#include <linux/kernel.h>
29011 +#include <linux/ctype.h>
29012 +#include <linux/init.h>
29013 +#include <linux/module.h>
29014 +
29015 +#include <asm/smp.h>
29016 +#include <asm/ipi.h>
29017 +
29018 +#if defined(CONFIG_ACPI)
29019 +#include <acpi/acpi_bus.h>
29020 +#endif
29021 +
29022 +/* which logical CPU number maps to which CPU (physical APIC ID) */
29023 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
29024 +EXPORT_SYMBOL(x86_cpu_to_apicid);
29025 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
29026 +
29027 +extern struct genapic apic_cluster;
29028 +extern struct genapic apic_flat;
29029 +extern struct genapic apic_physflat;
29030 +
29031 +#ifndef CONFIG_XEN
29032 +struct genapic *genapic = &apic_flat;
29033 +#else
29034 +extern struct genapic apic_xen;
29035 +struct genapic *genapic = &apic_xen;
29036 +#endif
29037 +
29038 +
29039 +/*
29040 + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
29041 + */
29042 +void __init clustered_apic_check(void)
29043 +{
29044 +#ifndef CONFIG_XEN
29045 +       long i;
29046 +       u8 clusters, max_cluster;
29047 +       u8 id;
29048 +       u8 cluster_cnt[NUM_APIC_CLUSTERS];
29049 +       int max_apic = 0;
29050 +
29051 +#if defined(CONFIG_ACPI)
29052 +       /*
29053 +        * Some x86_64 machines use physical APIC mode regardless of how many
29054 +        * procs/clusters are present (x86_64 ES7000 is an example).
29055 +        */
29056 +       if (acpi_fadt.revision > FADT2_REVISION_ID)
29057 +               if (acpi_fadt.force_apic_physical_destination_mode) {
29058 +                       genapic = &apic_cluster;
29059 +                       goto print;
29060 +               }
29061 +#endif
29062 +
29063 +       memset(cluster_cnt, 0, sizeof(cluster_cnt));
29064 +       for (i = 0; i < NR_CPUS; i++) {
29065 +               id = bios_cpu_apicid[i];
29066 +               if (id == BAD_APICID)
29067 +                       continue;
29068 +               if (id > max_apic)
29069 +                       max_apic = id;
29070 +               cluster_cnt[APIC_CLUSTERID(id)]++;
29071 +       }
29072 +
29073 +       /* Don't use clustered mode on AMD platforms. */
29074 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
29075 +               genapic = &apic_physflat;
29076 +#ifndef CONFIG_HOTPLUG_CPU
29077 +               /* In the CPU hotplug case we cannot use broadcast mode
29078 +                  because that opens a race when a CPU is removed.
29079 +                  Stay at physflat mode in this case.
29080 +                  It is bad to do this unconditionally though. Once
29081 +                  we have ACPI platform support for CPU hotplug
29082 +                  we should detect hotplug capability from ACPI tables and
29083 +                  only do this when really needed. -AK */
29084 +               if (max_apic <= 8)
29085 +                       genapic = &apic_flat;
29086 +#endif
29087 +               goto print;
29088 +       }
29089 +
29090 +       clusters = 0;
29091 +       max_cluster = 0;
29092 +
29093 +       for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
29094 +               if (cluster_cnt[i] > 0) {
29095 +                       ++clusters;
29096 +                       if (cluster_cnt[i] > max_cluster)
29097 +                               max_cluster = cluster_cnt[i];
29098 +               }
29099 +       }
29100 +
29101 +       /*
29102 +        * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
29103 +        * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
29104 +        * else physical mode.
29105 +        * (We don't use lowest priority delivery + HW APIC IRQ steering, so
29106 +        * can ignore the clustered logical case and go straight to physical.)
29107 +        */
29108 +       if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
29109 +#ifdef CONFIG_HOTPLUG_CPU
29110 +               /* Don't use APIC shortcuts in CPU hotplug to avoid races */
29111 +               genapic = &apic_physflat;
29112 +#else
29113 +               genapic = &apic_flat;
29114 +#endif
29115 +       } else
29116 +               genapic = &apic_cluster;
29117 +
29118 +print:
29119 +#else
29120 +       /* hardcode to xen apic functions */
29121 +       genapic = &apic_xen;
29122 +#endif
29123 +       printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
29124 +}
29125 +
29126 +/* Same for both flat and clustered. */
29127 +
29128 +#ifdef CONFIG_XEN
29129 +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
29130 +#endif
29131 +
29132 +void send_IPI_self(int vector)
29133 +{
29134 +#ifndef CONFIG_XEN
29135 +       __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
29136 +#else
29137 +       xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
29138 +#endif
29139 +}
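
Callers never test CONFIG_XEN themselves; they go through the genapic
function-pointer table selected above, so the event-channel IPI routines drop
in transparently. A hypothetical caller:

static void kick_other_cpus(int vector)
{
	/* dispatches to apic_xen under Xen, apic_flat/apic_cluster natively */
	genapic->send_IPI_allbutself(vector);
}
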
29140 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/head64-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/head64-xen.c
29141 --- ref-linux-2.6.16.9/arch/x86_64/kernel/head64-xen.c  1970-01-01 01:00:00.000000000 +0100
29142 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/head64-xen.c     2006-04-10 00:05:52.000000000 +0200
29143 @@ -0,0 +1,140 @@
29144 +/*
29145 + *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code
29146 + *
29147 + *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
29148 + *
29149 + *  $Id$
29150 + *
29151 + *  Jun Nakajima <jun.nakajima@intel.com>
29152 + *     Modified for Xen.
29153 + */
29154 +
29155 +#include <linux/init.h>
29156 +#include <linux/linkage.h>
29157 +#include <linux/types.h>
29158 +#include <linux/kernel.h>
29159 +#include <linux/string.h>
29160 +#include <linux/percpu.h>
29161 +
29162 +#include <asm/processor.h>
29163 +#include <asm/proto.h>
29164 +#include <asm/smp.h>
29165 +#include <asm/bootsetup.h>
29166 +#include <asm/setup.h>
29167 +#include <asm/desc.h>
29168 +#include <asm/pgtable.h>
29169 +#include <asm/sections.h>
29170 +
29171 +unsigned long start_pfn;
29172 +
29173 +/* Don't add a printk in here: printk relies on the PDA, which is not
29174 +   initialized yet. */
29175 +#if 0
29176 +static void __init clear_bss(void)
29177 +{
29178 +       memset(__bss_start, 0,
29179 +              (unsigned long) __bss_stop - (unsigned long) __bss_start);
29180 +}
29181 +#endif
29182 +
29183 +#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
29184 +#define OLD_CL_MAGIC_ADDR      0x90020
29185 +#define OLD_CL_MAGIC            0xA33F
29186 +#define OLD_CL_BASE_ADDR        0x90000
29187 +#define OLD_CL_OFFSET           0x90022
29188 +
29189 +extern char saved_command_line[];
29190 +
29191 +static void __init copy_bootdata(char *real_mode_data)
29192 +{
29193 +#ifndef CONFIG_XEN
29194 +       int new_data;
29195 +       char * command_line;
29196 +
29197 +       memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
29198 +       new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
29199 +       if (!new_data) {
29200 +               if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
29201 +                       printk("so old bootloader that it does not support commandline?!\n");
29202 +                       return;
29203 +               }
29204 +               new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
29205 +               printk("old bootloader convention, maybe loadlin?\n");
29206 +       }
29207 +       command_line = (char *) ((u64)(new_data));
29208 +       memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
29209 +#else
29210 +       int max_cmdline;
29211 +       
29212 +       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
29213 +               max_cmdline = COMMAND_LINE_SIZE;
29214 +       memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
29215 +       saved_command_line[max_cmdline-1] = '\0';
29216 +#endif
29217 +       printk("Bootdata ok (command line is %s)\n", saved_command_line);
29218 +}
29219 +
29220 +static void __init setup_boot_cpu_data(void)
29221 +{
29222 +       unsigned int dummy, eax;
29223 +
29224 +       /* get vendor info */
29225 +       cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
29226 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
29227 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
29228 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
29229 +
29230 +       /* get cpu type */
29231 +       cpuid(1, &eax, &dummy, &dummy,
29232 +               (unsigned int *) &boot_cpu_data.x86_capability);
29233 +       boot_cpu_data.x86 = (eax >> 8) & 0xf;
29234 +       boot_cpu_data.x86_model = (eax >> 4) & 0xf;
29235 +       boot_cpu_data.x86_mask = eax & 0xf;
29236 +}
29237 +
29238 +void __init x86_64_start_kernel(char * real_mode_data)
29239 +{
29240 +       char *s;
29241 +       int i;
29242 +
29243 +       xen_start_info = (struct start_info *)real_mode_data;
29244 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
29245 +               phys_to_machine_mapping =
29246 +                       (unsigned long *)xen_start_info->mfn_list;
29247 +               start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
29248 +                       xen_start_info->nr_pt_frames;
29249 +       }
29250 +
29251 +#if 0
29252 +       for (i = 0; i < 256; i++)
29253 +               set_intr_gate(i, early_idt_handler);
29254 +       asm volatile("lidt %0" :: "m" (idt_descr));
29255 +#endif
29256 +
29257 +       for (i = 0; i < NR_CPUS; i++)
29258 +               cpu_pda(i) = &boot_cpu_pda[i];
29259 +
29260 +       pda_init(0);
29261 +       copy_bootdata(real_mode_data);
29262 +#ifdef CONFIG_SMP
29263 +       cpu_set(0, cpu_online_map);
29264 +#endif
29265 +       s = strstr(saved_command_line, "earlyprintk=");
29266 +       if (s != NULL)
29267 +               setup_early_printk(strchr(s, '=') + 1);
29268 +#ifdef CONFIG_NUMA
29269 +       s = strstr(saved_command_line, "numa=");
29270 +       if (s != NULL)
29271 +               numa_setup(s+5);
29272 +#endif
29273 +#ifdef CONFIG_X86_IO_APIC
29274 +       if (strstr(saved_command_line, "disableapic"))
29275 +               disable_apic = 1;
29276 +#endif
29277 +       /* You need early console to see that */
29278 +       if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
29279 +               panic("Kernel too big for kernel mapping\n");
29280 +
29281 +       setup_boot_cpu_data();
29282 +       start_kernel();
29283 +}
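
The CPUID(1) decode in setup_boot_cpu_data() above packs family, model and
stepping into nibbles of %eax. A standalone check with a hypothetical leaf-1
value of 0x00000f43:

#include <assert.h>

int main(void)
{
	unsigned int eax = 0x00000f43;	/* hypothetical CPUID(1) result */

	assert(((eax >> 8) & 0xf) == 0xf);	/* boot_cpu_data.x86 (family) */
	assert(((eax >> 4) & 0xf) == 0x4);	/* boot_cpu_data.x86_model */
	assert((eax & 0xf) == 0x3);		/* boot_cpu_data.x86_mask (stepping) */
	return 0;
}
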
29284 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/head-xen.S tmp-linux-2.6-xen.patch/arch/x86_64/kernel/head-xen.S
29285 --- ref-linux-2.6.16.9/arch/x86_64/kernel/head-xen.S    1970-01-01 01:00:00.000000000 +0100
29286 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/head-xen.S       2006-04-10 00:05:52.000000000 +0200
29287 @@ -0,0 +1,156 @@
29288 +/*
29289 + *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
29290 + *
29291 + *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
29292 + *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
29293 + *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
29294 + *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
29295 + *
29296 + *  $Id$
29297 + *
29298 + *  Jun Nakajima <jun.nakajima@intel.com>
29299 + *    Modified for Xen                                
29300 + */
29301 +
29302 +
29303 +#include <linux/linkage.h>
29304 +#include <linux/threads.h>
29305 +#include <linux/init.h>
29306 +#include <asm/desc.h>
29307 +#include <asm/segment.h>
29308 +#include <asm/page.h>
29309 +#include <asm/msr.h>
29310 +#include <asm/cache.h>
29311 +
29312 +       .text
29313 +       .code64
29314 +       .globl startup_64
29315 +startup_64:
29316 +ENTRY(_start)
29317 +       movq $(init_thread_union+THREAD_SIZE-8),%rsp
29318 +       /* zero EFLAGS after setting rsp */
29319 +       pushq $0
29320 +       popfq
29321 +
29322 +       /* rsi is a pointer to the start_info structure;
29323 +          pass it to C as the first argument */
29324 +       movq %rsi,%rdi
29325 +       jmp x86_64_start_kernel
29326 +
29327 +ENTRY(stext)
29328 +ENTRY(_stext)
29329 +
29330 +       $page = 0
29331 +#define NEXT_PAGE(name) \
29332 +       $page = $page + 1; \
29333 +       .org $page * 0x1000; \
29334 +       phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
29335 +ENTRY(name)
29336 +
29337 +NEXT_PAGE(init_level4_pgt)
29338 +       /* This gets initialized in x86_64_start_kernel */
29339 +       .fill   512,8,0
29340 +
29341 +        /*
29342 +         * We update two pgd entries to keep the kernel and user pgds
29343 +         * consistent at pgd_populate(); this can happen for kernel modules,
29344 +         * so we place this page here for those cases to avoid memory
29345 +         * corruption. We also use this page to establish the initial
29346 +         * mapping for the vsyscall area.
29347 +         */
29348 +NEXT_PAGE(init_level4_user_pgt)
29349 +       .fill   512,8,0
29350 +
29351 +NEXT_PAGE(level3_kernel_pgt)
29352 +       .fill   512,8,0
29353 +
29354 +        /*
29355 +         * This is used for vsyscall area mapping as we have a different
29356 +         * level4 page table for user.
29357 +         */
29358 +NEXT_PAGE(level3_user_pgt)
29359 +        .fill  512,8,0
29360 +
29361 +NEXT_PAGE(level2_kernel_pgt)
29362 +       .fill   512,8,0
29363 +
29364 +NEXT_PAGE(empty_zero_page)
29365 +       .skip PAGE_SIZE
29366 +
29367 +NEXT_PAGE(hypercall_page)
29368 +       .fill   512,8,0
29369 +
29370 +#undef NEXT_PAGE
29371 +
29372 +       .data
29373 +
29374 +       .align 16
29375 +       .globl cpu_gdt_descr
29376 +cpu_gdt_descr:
29377 +       .word   gdt_end-cpu_gdt_table
29378 +gdt:
29379 +       .quad   cpu_gdt_table
29380 +#ifdef CONFIG_SMP
29381 +       .rept   NR_CPUS-1
29382 +       .word   0
29383 +       .quad   0
29384 +       .endr
29385 +#endif
29386 +
29387 +/* We need valid kernel segments for data and code in long mode too;
29388 + * IRET will check the segment types.  kkeil 2000/10/28
29389 + * Also, sysret mandates a special GDT layout.
29390 + */
29391 +                               
29392 +       .section .data.page_aligned, "aw"
29393 +       .align PAGE_SIZE
29394 +
29395 +/* The TLS descriptors are currently at a different place compared to i386.
29396 +   Hopefully nobody expects them at a fixed place (Wine?) */
29397 +
29398 +ENTRY(cpu_gdt_table)
29399 +       .quad   0x0000000000000000      /* NULL descriptor */
29400 +       .quad   0x0                     /* unused */
29401 +       .quad   0x00af9a000000ffff      /* __KERNEL_CS */
29402 +       .quad   0x00cf92000000ffff      /* __KERNEL_DS */
29403 +       .quad   0x00cffa000000ffff      /* __USER32_CS */
29404 +       .quad   0x00cff2000000ffff      /* __USER_DS, __USER32_DS  */
29405 +       .quad   0x00affa000000ffff      /* __USER_CS */
29406 +       .quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
29407 +       .quad   0,0                     /* TSS */
29408 +       .quad   0,0                     /* LDT */
29409 +       .quad   0,0,0                   /* three TLS descriptors */
29410 +       .quad   0                       /* unused */
29411 +gdt_end:
29412 +       /* asm/segment.h:GDT_ENTRIES must match this */
29413 +       /* This should be a multiple of the cache line size */
29414 +       /* GDTs of other CPUs are now dynamically allocated */
29415 +
29416 +       /* zero the remaining page */
29417 +       .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
29418 +
29419 +/*
29420 + * __xen_guest information
29421 + */
29422 +.macro utoh value
29423 + .if (\value) < 0 || (\value) >= 0x10
29424 +       utoh (((\value)>>4)&0x0fffffffffffffff)
29425 + .endif
29426 + .if ((\value) & 0xf) < 10
29427 +  .byte '0' + ((\value) & 0xf)
29428 + .else
29429 +  .byte 'A' + ((\value) & 0xf) - 10
29430 + .endif
29431 +.endm
29432 +
29433 +.section __xen_guest
29434 +       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
29435 +       .ascii  ",XEN_VER=xen-3.0"
29436 +       .ascii  ",VIRT_BASE=0x"; utoh __START_KERNEL_map
29437 +       .ascii  ",HYPERCALL_PAGE=0x"; utoh (phys_hypercall_page >> PAGE_SHIFT)
29438 +       .ascii  ",FEATURES=writable_page_tables"
29439 +       .ascii           "|writable_descriptor_tables"
29440 +       .ascii           "|auto_translated_physmap"
29441 +       .ascii           "|supervisor_mode_kernel"
29442 +       .ascii  ",LOADER=generic"
29443 +       .byte   0
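
The utoh assembler macro above prints an unsigned value in hex, most
significant digit first, by recursing on value>>4 before emitting value&0xf
(the `< 0` test and the 0x0fffffffffffffff mask compensate for .if's signed
arithmetic). The same logic as a small C program:

#include <stdio.h>

static void utoh(unsigned long value)
{
	if (value >= 0x10)		/* recurse first: high digits print first */
		utoh(value >> 4);
	putchar((value & 0xf) < 10 ? '0' + (int)(value & 0xf)
				   : 'A' + (int)(value & 0xf) - 10);
}

int main(void)
{
	utoh(0xffffffff80000000UL);	/* e.g. __START_KERNEL_map */
	putchar('\n');			/* prints FFFFFFFF80000000 */
	return 0;
}
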
29444 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/init_task.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/init_task.c
29445 --- ref-linux-2.6.16.9/arch/x86_64/kernel/init_task.c   2006-04-19 08:10:14.000000000 +0200
29446 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/init_task.c      2006-04-10 00:05:52.000000000 +0200
29447 @@ -37,6 +37,8 @@ union thread_union init_thread_union 
29448  struct task_struct init_task = INIT_TASK(init_task);
29449  
29450  EXPORT_SYMBOL(init_task);
29451 +
29452 +#ifndef CONFIG_X86_NO_TSS
29453  /*
29454   * per-CPU TSS segments. Threads are completely 'soft' on Linux,
29455   * no more per-task TSS's. The TSS size is kept cacheline-aligned
29456 @@ -45,5 +47,6 @@ EXPORT_SYMBOL(init_task);
29457   * on exact cacheline boundaries, to eliminate cacheline ping-pong.
29458   */ 
29459  DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
29460 +#endif
29461  
29462  #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
29463 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/io_apic-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/io_apic-xen.c
29464 --- ref-linux-2.6.16.9/arch/x86_64/kernel/io_apic-xen.c 1970-01-01 01:00:00.000000000 +0100
29465 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/io_apic-xen.c    2006-04-10 00:05:52.000000000 +0200
29466 @@ -0,0 +1,2231 @@
29467 +/*
29468 + *     Intel IO-APIC support for multi-Pentium hosts.
29469 + *
29470 + *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
29471 + *
29472 + *     Many thanks to Stig Venaas for trying out countless experimental
29473 + *     patches and reporting/debugging problems patiently!
29474 + *
29475 + *     (c) 1999, Multiple IO-APIC support, developed by
29476 + *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
29477 + *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
29478 + *     further tested and cleaned up by Zach Brown <zab@redhat.com>
29479 + *     and Ingo Molnar <mingo@redhat.com>
29480 + *
29481 + *     Fixes
29482 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
29483 + *                                     thanks to Eric Gilmore
29484 + *                                     and Rolf G. Tews
29485 + *                                     for testing these extensively
29486 + *     Paul Diefenbaugh        :       Added full ACPI support
29487 + */
29488 +
29489 +#include <linux/mm.h>
29490 +#include <linux/interrupt.h>
29491 +#include <linux/init.h>
29492 +#include <linux/delay.h>
29493 +#include <linux/sched.h>
29494 +#include <linux/config.h>
29495 +#include <linux/smp_lock.h>
29496 +#include <linux/mc146818rtc.h>
29497 +#include <linux/acpi.h>
29498 +#include <linux/sysdev.h>
29499 +#ifdef CONFIG_ACPI
29500 +#include <acpi/acpi_bus.h>
29501 +#endif
29502 +
29503 +#include <asm/io.h>
29504 +#include <asm/smp.h>
29505 +#include <asm/desc.h>
29506 +#include <asm/proto.h>
29507 +#include <asm/mach_apic.h>
29508 +#include <asm/acpi.h>
29509 +#include <asm/dma.h>
29510 +
29511 +#define __apicdebuginit  __init
29512 +
29513 +int sis_apic_bug; /* not actually supported, dummy for compile */
29514 +
29515 +static int no_timer_check;
29516 +
29517 +int disable_timer_pin_1 __initdata;
29518 +
29519 +#ifndef CONFIG_XEN
29520 +int timer_over_8254 __initdata = 1;
29521 +
29522 +/* Where, if anywhere, is the i8259 connected in external int mode? */
29523 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
29524 +#endif
29525 +
29526 +static DEFINE_SPINLOCK(ioapic_lock);
29527 +
29528 +/*
29529 + * # of IRQ routing registers
29530 + */
29531 +int nr_ioapic_registers[MAX_IO_APICS];
29532 +
29533 +/*
29534 + * Rough estimation of how many shared IRQs there are, can
29535 + * be changed anytime.
29536 + */
29537 +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
29538 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
29539 +
29540 +/*
29541 + * This is performance-critical, we want to do it O(1)
29542 + *
29543 + * the indexing order of this array favors 1:1 mappings
29544 + * between pins and IRQs.
29545 + */
29546 +
29547 +static struct irq_pin_list {
29548 +       short apic, pin, next;
29549 +} irq_2_pin[PIN_MAP_SIZE];
29550 +
29551 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
29552 +#ifdef CONFIG_PCI_MSI
29553 +#define vector_to_irq(vector)  \
29554 +       (platform_legacy_irq(vector) ? vector : vector_irq[vector])
29555 +#else
29556 +#define vector_to_irq(vector)  (vector)
29557 +#endif
29558 +
29559 +#ifdef CONFIG_XEN
29560 +
29561 +#include <xen/interface/xen.h>
29562 +#include <xen/interface/physdev.h>
29563 +
29564 +/* Fake i8259 */
29565 +#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
29566 +#define disable_8259A_irq(_irq)  ((void)0)
29567 +#define i8259A_irq_pending(_irq) (0)
29568 +
29569 +unsigned long io_apic_irqs;
29570 +
29571 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
29572 +{
29573 +       physdev_op_t op;
29574 +       int ret;
29575 +
29576 +       op.cmd = PHYSDEVOP_APIC_READ;
29577 +       op.u.apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
29578 +       op.u.apic_op.reg = reg;
29579 +       ret = HYPERVISOR_physdev_op(&op);
29580 +       if (ret)
29581 +               return ret;
29582 +       return op.u.apic_op.value;
29583 +}
29584 +
29585 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
29586 +{
29587 +       physdev_op_t op;
29588 +
29589 +       op.cmd = PHYSDEVOP_APIC_WRITE;
29590 +       op.u.apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
29591 +       op.u.apic_op.reg = reg;
29592 +       op.u.apic_op.value = value;
29593 +       HYPERVISOR_physdev_op(&op);
29594 +}
29595 +
29596 +#define io_apic_read(a,r)    xen_io_apic_read(a,r)
29597 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
29598 +
29599 +#define clear_IO_APIC() ((void)0)
29600 +
29601 +#else
29602 +
29603 +#ifdef CONFIG_SMP
29604 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
29605 +{
29606 +       unsigned long flags;
29607 +       unsigned int dest;
29608 +       cpumask_t tmp;
29609 +
29610 +       cpus_and(tmp, mask, cpu_online_map);
29611 +       if (cpus_empty(tmp))
29612 +               tmp = TARGET_CPUS;
29613 +
29614 +       cpus_and(mask, tmp, CPU_MASK_ALL);
29615 +
29616 +       dest = cpu_mask_to_apicid(mask);
29617 +
29618 +       /*
29619 +        * Only the high 8 bits are valid.
29620 +        */
29621 +       dest = SET_APIC_LOGICAL_ID(dest);
29622 +
29623 +       spin_lock_irqsave(&ioapic_lock, flags);
29624 +       __DO_ACTION(1, = dest, )
29625 +       set_irq_info(irq, mask);
29626 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29627 +}
29628 +#endif
29629 +
29630 +#endif /* !CONFIG_XEN */
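
Under Xen the hypervisor owns the IO-APIC, so the io_apic_read()/io_apic_write()
macros above turn every register access into a PHYSDEVOP_APIC_READ/WRITE
hypercall keyed by the APIC's physical base address. A usage sketch (register
0x01 is the IO-APIC version register):

static unsigned int ioapic_version_sketch(unsigned int apic)
{
	/* expands to xen_io_apic_read() here, to a direct MMIO read natively */
	return io_apic_read(apic, 0x01) & 0xff;
}
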
29631 +
29632 +/*
29633 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
29634 + * shared ISA-space IRQs, so we have to support them. We are super
29635 + * fast in the common case, and fast for shared ISA-space IRQs.
29636 + */
29637 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
29638 +{
29639 +       static int first_free_entry = NR_IRQS;
29640 +       struct irq_pin_list *entry = irq_2_pin + irq;
29641 +
29642 +       BUG_ON(irq >= NR_IRQS);
29643 +       while (entry->next)
29644 +               entry = irq_2_pin + entry->next;
29645 +
29646 +       if (entry->pin != -1) {
29647 +               entry->next = first_free_entry;
29648 +               entry = irq_2_pin + entry->next;
29649 +               if (++first_free_entry >= PIN_MAP_SIZE)
29650 +                       panic("io_apic.c: ran out of irq_2_pin entries!");
29651 +       }
29652 +       entry->apic = apic;
29653 +       entry->pin = pin;
29654 +}
29655 +
29656 +#ifndef CONFIG_XEN
29657 +#define __DO_ACTION(R, ACTION, FINAL)                                  \
29658 +                                                                       \
29659 +{                                                                      \
29660 +       int pin;                                                        \
29661 +       struct irq_pin_list *entry = irq_2_pin + irq;                   \
29662 +                                                                       \
29663 +       BUG_ON(irq >= NR_IRQS);                                         \
29664 +       for (;;) {                                                      \
29665 +               unsigned int reg;                                       \
29666 +               pin = entry->pin;                                       \
29667 +               if (pin == -1)                                          \
29668 +                       break;                                          \
29669 +               reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
29670 +               reg ACTION;                                             \
29671 +               io_apic_modify(entry->apic, reg);                       \
29672 +               if (!entry->next)                                       \
29673 +                       break;                                          \
29674 +               entry = irq_2_pin + entry->next;                        \
29675 +       }                                                               \
29676 +       FINAL;                                                          \
29677 +}
29678 +
29679 +#define DO_ACTION(name,R,ACTION, FINAL)                                        \
29680 +                                                                       \
29681 +       static void name##_IO_APIC_irq (unsigned int irq)               \
29682 +       __DO_ACTION(R, ACTION, FINAL)
29683 +
29684 +DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
29685 +                                               /* mask = 1 */
29686 +DO_ACTION( __unmask,           0, &= 0xfffeffff, )
29687 +                                               /* mask = 0 */
29688 +
29689 +static void mask_IO_APIC_irq (unsigned int irq)
29690 +{
29691 +       unsigned long flags;
29692 +
29693 +       spin_lock_irqsave(&ioapic_lock, flags);
29694 +       __mask_IO_APIC_irq(irq);
29695 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29696 +}
29697 +
29698 +static void unmask_IO_APIC_irq (unsigned int irq)
29699 +{
29700 +       unsigned long flags;
29701 +
29702 +       spin_lock_irqsave(&ioapic_lock, flags);
29703 +       __unmask_IO_APIC_irq(irq);
29704 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29705 +}
29706 +
29707 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
29708 +{
29709 +       struct IO_APIC_route_entry entry;
29710 +       unsigned long flags;
29711 +
29712 +       /* Check delivery_mode to be sure we're not clearing an SMI pin */
29713 +       spin_lock_irqsave(&ioapic_lock, flags);
29714 +       *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
29715 +       *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
29716 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29717 +       if (entry.delivery_mode == dest_SMI)
29718 +               return;
29719 +       /*
29720 +        * Disable it in the IO-APIC irq-routing table:
29721 +        */
29722 +       memset(&entry, 0, sizeof(entry));
29723 +       entry.mask = 1;
29724 +       spin_lock_irqsave(&ioapic_lock, flags);
29725 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
29726 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
29727 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29728 +}
29729 +
29730 +static void clear_IO_APIC (void)
29731 +{
29732 +       int apic, pin;
29733 +
29734 +       for (apic = 0; apic < nr_ioapics; apic++)
29735 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
29736 +                       clear_IO_APIC_pin(apic, pin);
29737 +}
29738 +
29739 +#endif /* !CONFIG_XEN */
29740 +
29741 +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
29742 +
29743 +/*
29744 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
29745 + * specific CPU-side IRQs.
29746 + */
29747 +
29748 +#define MAX_PIRQS 8
29749 +static int pirq_entries [MAX_PIRQS];
29750 +static int pirqs_enabled;
29751 +int skip_ioapic_setup;
29752 +int ioapic_force;
29753 +
29754 +/* dummy parsing: see setup.c */
29755 +
29756 +static int __init disable_ioapic_setup(char *str)
29757 +{
29758 +       skip_ioapic_setup = 1;
29759 +       return 1;
29760 +}
29761 +
29762 +static int __init enable_ioapic_setup(char *str)
29763 +{
29764 +       ioapic_force = 1;
29765 +       skip_ioapic_setup = 0;
29766 +       return 1;
29767 +}
29768 +
29769 +__setup("noapic", disable_ioapic_setup);
29770 +__setup("apic", enable_ioapic_setup);
29771 +
29772 +#ifndef CONFIG_XEN
29773 +static int __init setup_disable_8254_timer(char *s)
29774 +{
29775 +       timer_over_8254 = -1;
29776 +       return 1;
29777 +}
29778 +static int __init setup_enable_8254_timer(char *s)
29779 +{
29780 +       timer_over_8254 = 2;
29781 +       return 1;
29782 +}
29783 +
29784 +__setup("disable_8254_timer", setup_disable_8254_timer);
29785 +__setup("enable_8254_timer", setup_enable_8254_timer);
29786 +#endif /* !CONFIG_XEN */
29787 +
29788 +#include <asm/pci-direct.h>
29789 +#include <linux/pci_ids.h>
29790 +#include <linux/pci.h>
29791 +
29792 +/* Temporary hack. Nvidia and VIA boards currently only work with the
29793 +   IO-APIC off. Check for an Nvidia or VIA PCI bridge and, if found,
29794 +   turn the IO-APIC off. Use PCI direct access, as this runs before the PCI subsystem.
29795 +
29796 +   Can be overwritten with "apic"
29797 +
29798 +   And another hack to disable the IOMMU on VIA chipsets.
29799 +
29800 +   ... and others. Really should move this somewhere else.
29801 +
29802 +   Kludge-O-Rama. */
29803 +void __init check_ioapic(void) 
29804 +{ 
29805 +       int num,slot,func; 
29806 +       /* Poor man's PCI discovery */
29807 +       for (num = 0; num < 32; num++) { 
29808 +               for (slot = 0; slot < 32; slot++) { 
29809 +                       for (func = 0; func < 8; func++) { 
29810 +                               u32 class;
29811 +                               u32 vendor;
29812 +                               u8 type;
29813 +                               class = read_pci_config(num,slot,func,
29814 +                                                       PCI_CLASS_REVISION);
29815 +                               if (class == 0xffffffff)
29816 +                                       break; 
29817 +
29818 +                               if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
29819 +                                       continue; 
29820 +
29821 +                               vendor = read_pci_config(num, slot, func, 
29822 +                                                        PCI_VENDOR_ID);
29823 +                               vendor &= 0xffff;
29824 +                               switch (vendor) { 
29825 +                               case PCI_VENDOR_ID_VIA:
29826 +#ifdef CONFIG_GART_IOMMU
29827 +                                       if ((end_pfn > MAX_DMA32_PFN ||
29828 +                                            force_iommu) &&
29829 +                                           !iommu_aperture_allowed) {
29830 +                                               printk(KERN_INFO
29831 +    "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
29832 +                                               iommu_aperture_disabled = 1;
29833 +                                       }
29834 +#endif
29835 +                                       return;
29836 +                               case PCI_VENDOR_ID_NVIDIA:
29837 +#ifdef CONFIG_ACPI
29838 +                                       /* All timer overrides on Nvidia
29839 +                                          seem to be wrong. Skip them. */
29840 +                                       acpi_skip_timer_override = 1;
29841 +                                       printk(KERN_INFO 
29842 +            "Nvidia board detected. Ignoring ACPI timer override.\n");
29843 +#endif
29844 +                                       /* RED-PEN skip them on mptables too? */
29845 +                                       return;
29846 +                               case PCI_VENDOR_ID_ATI:
29847 +
29848 +                               /* This should be actually default, but
29849 +                                  for 2.6.16 let's do it for ATI only where
29850 +                                  it's really needed. */
29851 +#ifndef CONFIG_XEN
29852 +                                       if (timer_over_8254 == 1) {     
29853 +                                               timer_over_8254 = 0;    
29854 +                                       printk(KERN_INFO
29855 +               "ATI board detected. Disabling timer routing over 8254.\n");
29856 +                                       }       
29857 +#endif
29858 +                                       return;
29859 +                               } 
29860 +
29861 +
29862 +                               /* No multi-function device? */
29863 +                               type = read_pci_config_byte(num,slot,func,
29864 +                                                           PCI_HEADER_TYPE);
29865 +                               if (!(type & 0x80))
29866 +                                       break;
29867 +                       } 
29868 +               }
29869 +       }
29870 +} 
29871 +
29872 +static int __init ioapic_pirq_setup(char *str)
29873 +{
29874 +       int i, max;
29875 +       int ints[MAX_PIRQS+1];
29876 +
29877 +       get_options(str, ARRAY_SIZE(ints), ints);
29878 +
29879 +       for (i = 0; i < MAX_PIRQS; i++)
29880 +               pirq_entries[i] = -1;
29881 +
29882 +       pirqs_enabled = 1;
29883 +       apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
29884 +       max = MAX_PIRQS;
29885 +       if (ints[0] < MAX_PIRQS)
29886 +               max = ints[0];
29887 +
29888 +       for (i = 0; i < max; i++) {
29889 +               apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
29890 +               /*
29891 +                * PIRQs are mapped upside down, usually.
29892 +                */
29893 +               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
29894 +       }
29895 +       return 1;
29896 +}
29897 +
29898 +__setup("pirq=", ioapic_pirq_setup);
29899 +
29900 +/*
29901 + * Find the IRQ entry number of a certain pin.
29902 + */
29903 +static int find_irq_entry(int apic, int pin, int type)
29904 +{
29905 +       int i;
29906 +
29907 +       for (i = 0; i < mp_irq_entries; i++)
29908 +               if (mp_irqs[i].mpc_irqtype == type &&
29909 +                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
29910 +                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
29911 +                   mp_irqs[i].mpc_dstirq == pin)
29912 +                       return i;
29913 +
29914 +       return -1;
29915 +}
29916 +
29917 +#ifndef CONFIG_XEN
29918 +/*
29919 + * Find the pin to which IRQ[irq] (ISA) is connected
29920 + */
29921 +static int __init find_isa_irq_pin(int irq, int type)
29922 +{
29923 +       int i;
29924 +
29925 +       for (i = 0; i < mp_irq_entries; i++) {
29926 +               int lbus = mp_irqs[i].mpc_srcbus;
29927 +
29928 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
29929 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
29930 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
29931 +                   (mp_irqs[i].mpc_irqtype == type) &&
29932 +                   (mp_irqs[i].mpc_srcbusirq == irq))
29933 +
29934 +                       return mp_irqs[i].mpc_dstirq;
29935 +       }
29936 +       return -1;
29937 +}
29938 +
29939 +static int __init find_isa_irq_apic(int irq, int type)
29940 +{
29941 +       int i;
29942 +
29943 +       for (i = 0; i < mp_irq_entries; i++) {
29944 +               int lbus = mp_irqs[i].mpc_srcbus;
29945 +
29946 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
29947 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
29948 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
29949 +                   (mp_irqs[i].mpc_irqtype == type) &&
29950 +                   (mp_irqs[i].mpc_srcbusirq == irq))
29951 +                       break;
29952 +       }
29953 +       if (i < mp_irq_entries) {
29954 +               int apic;
29955 +               for(apic = 0; apic < nr_ioapics; apic++) {
29956 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
29957 +                               return apic;
29958 +               }
29959 +       }
29960 +
29961 +       return -1;
29962 +}
29963 +#endif
29964 +
29965 +/*
29966 + * Find a specific PCI IRQ entry.
29967 + * Not an __init, possibly needed by modules
29968 + */
29969 +static int pin_2_irq(int idx, int apic, int pin);
29970 +
29971 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
29972 +{
29973 +       int apic, i, best_guess = -1;
29974 +
29975 +       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
29976 +               bus, slot, pin);
29977 +       if (mp_bus_id_to_pci_bus[bus] == -1) {
29978 +               apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
29979 +               return -1;
29980 +       }
29981 +       for (i = 0; i < mp_irq_entries; i++) {
29982 +               int lbus = mp_irqs[i].mpc_srcbus;
29983 +
29984 +               for (apic = 0; apic < nr_ioapics; apic++)
29985 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
29986 +                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
29987 +                               break;
29988 +
29989 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
29990 +                   !mp_irqs[i].mpc_irqtype &&
29991 +                   (bus == lbus) &&
29992 +                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
29993 +                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
29994 +
29995 +                       if (!(apic || IO_APIC_IRQ(irq)))
29996 +                               continue;
29997 +
29998 +                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
29999 +                               return irq;
30000 +                       /*
30001 +                        * Use the first all-but-pin matching entry as a
30002 +                        * best-guess fuzzy result for broken mptables.
30003 +                        */
30004 +                       if (best_guess < 0)
30005 +                               best_guess = irq;
30006 +               }
30007 +       }
30008 +       BUG_ON(best_guess >= NR_IRQS);
30009 +       return best_guess;
30010 +}
30011 +
30012 +/*
30013 + * EISA Edge/Level control register, ELCR
30014 + */
30015 +static int EISA_ELCR(unsigned int irq)
30016 +{
30017 +       if (irq < 16) {
30018 +               unsigned int port = 0x4d0 + (irq >> 3);
30019 +               return (inb(port) >> (irq & 7)) & 1;
30020 +       }
30021 +       apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
30022 +       return 0;
30023 +}
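
The ELCR port arithmetic in EISA_ELCR() above spreads IRQs 0-15 across the two
registers at 0x4d0/0x4d1, one bit per IRQ. A standalone check for IRQ 9:

#include <assert.h>

int main(void)
{
	unsigned int irq = 9;

	assert(0x4d0 + (irq >> 3) == 0x4d1);	/* second ELCR register */
	assert((irq & 7) == 1);			/* bit 1 within that register */
	return 0;
}
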
30024 +
30025 +/* EISA interrupts are always polarity zero and can be edge- or
30026 + * level-triggered depending on the ELCR value.  If an interrupt is
30027 + * listed as EISA conforming in the MP table, its trigger type must
30028 + * be read in from the ELCR. */
30029 +
30030 +#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
30031 +#define default_EISA_polarity(idx)     (0)
30032 +
30033 +/* ISA interrupts are always polarity zero edge triggered,
30034 + * when listed as conforming in the MP table. */
30035 +
30036 +#define default_ISA_trigger(idx)       (0)
30037 +#define default_ISA_polarity(idx)      (0)
30038 +
30039 +/* PCI interrupts are always polarity one level triggered,
30040 + * when listed as conforming in the MP table. */
30041 +
30042 +#define default_PCI_trigger(idx)       (1)
30043 +#define default_PCI_polarity(idx)      (1)
30044 +
30045 +/* MCA interrupts are always polarity zero level triggered,
30046 + * when listed as conforming in the MP table. */
30047 +
30048 +#define default_MCA_trigger(idx)       (1)
30049 +#define default_MCA_polarity(idx)      (0)
30050 +
30051 +static int __init MPBIOS_polarity(int idx)
30052 +{
30053 +       int bus = mp_irqs[idx].mpc_srcbus;
30054 +       int polarity;
30055 +
30056 +       /*
30057 +        * Determine IRQ line polarity (high active or low active):
30058 +        */
30059 +       switch (mp_irqs[idx].mpc_irqflag & 3)
30060 +       {
30061 +               case 0: /* conforms, ie. bus-type dependent polarity */
30062 +               {
30063 +                       switch (mp_bus_id_to_type[bus])
30064 +                       {
30065 +                               case MP_BUS_ISA: /* ISA pin */
30066 +                               {
30067 +                                       polarity = default_ISA_polarity(idx);
30068 +                                       break;
30069 +                               }
30070 +                               case MP_BUS_EISA: /* EISA pin */
30071 +                               {
30072 +                                       polarity = default_EISA_polarity(idx);
30073 +                                       break;
30074 +                               }
30075 +                               case MP_BUS_PCI: /* PCI pin */
30076 +                               {
30077 +                                       polarity = default_PCI_polarity(idx);
30078 +                                       break;
30079 +                               }
30080 +                               case MP_BUS_MCA: /* MCA pin */
30081 +                               {
30082 +                                       polarity = default_MCA_polarity(idx);
30083 +                                       break;
30084 +                               }
30085 +                               default:
30086 +                               {
30087 +                                       printk(KERN_WARNING "broken BIOS!!\n");
30088 +                                       polarity = 1;
30089 +                                       break;
30090 +                               }
30091 +                       }
30092 +                       break;
30093 +               }
30094 +               case 1: /* high active */
30095 +               {
30096 +                       polarity = 0;
30097 +                       break;
30098 +               }
30099 +               case 2: /* reserved */
30100 +               {
30101 +                       printk(KERN_WARNING "broken BIOS!!\n");
30102 +                       polarity = 1;
30103 +                       break;
30104 +               }
30105 +               case 3: /* low active */
30106 +               {
30107 +                       polarity = 1;
30108 +                       break;
30109 +               }
30110 +               default: /* invalid */
30111 +               {
30112 +                       printk(KERN_WARNING "broken BIOS!!\n");
30113 +                       polarity = 1;
30114 +                       break;
30115 +               }
30116 +       }
30117 +       return polarity;
30118 +}
30119 +
30120 +static int MPBIOS_trigger(int idx)
30121 +{
30122 +       int bus = mp_irqs[idx].mpc_srcbus;
30123 +       int trigger;
30124 +
30125 +       /*
30126 +        * Determine IRQ trigger mode (edge or level sensitive):
30127 +        */
30128 +       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
30129 +       {
30130 +               case 0: /* conforms, ie. bus-type dependent */
30131 +               {
30132 +                       switch (mp_bus_id_to_type[bus])
30133 +                       {
30134 +                               case MP_BUS_ISA: /* ISA pin */
30135 +                               {
30136 +                                       trigger = default_ISA_trigger(idx);
30137 +                                       break;
30138 +                               }
30139 +                               case MP_BUS_EISA: /* EISA pin */
30140 +                               {
30141 +                                       trigger = default_EISA_trigger(idx);
30142 +                                       break;
30143 +                               }
30144 +                               case MP_BUS_PCI: /* PCI pin */
30145 +                               {
30146 +                                       trigger = default_PCI_trigger(idx);
30147 +                                       break;
30148 +                               }
30149 +                               case MP_BUS_MCA: /* MCA pin */
30150 +                               {
30151 +                                       trigger = default_MCA_trigger(idx);
30152 +                                       break;
30153 +                               }
30154 +                               default:
30155 +                               {
30156 +                                       printk(KERN_WARNING "broken BIOS!!\n");
30157 +                                       trigger = 1;
30158 +                                       break;
30159 +                               }
30160 +                       }
30161 +                       break;
30162 +               }
30163 +               case 1: /* edge */
30164 +               {
30165 +                       trigger = 0;
30166 +                       break;
30167 +               }
30168 +               case 2: /* reserved */
30169 +               {
30170 +                       printk(KERN_WARNING "broken BIOS!!\n");
30171 +                       trigger = 1;
30172 +                       break;
30173 +               }
30174 +               case 3: /* level */
30175 +               {
30176 +                       trigger = 1;
30177 +                       break;
30178 +               }
30179 +               default: /* invalid */
30180 +               {
30181 +                       printk(KERN_WARNING "broken BIOS!!\n");
30182 +                       trigger = 0;
30183 +                       break;
30184 +               }
30185 +       }
30186 +       return trigger;
30187 +}
30188 +
30189 +static inline int irq_polarity(int idx)
30190 +{
30191 +       return MPBIOS_polarity(idx);
30192 +}
30193 +
30194 +static inline int irq_trigger(int idx)
30195 +{
30196 +       return MPBIOS_trigger(idx);
30197 +}
30198 +
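+/*
+ * Illustrative sketch, not part of the original patch: the MP-table
+ * mpc_irqflag field packs polarity into bits 0-1 and trigger mode into
+ * bits 2-3, which is exactly what the two switches above decode
+ * (0 = conform to bus, 1 = high/edge, 2 = reserved, 3 = low/level).
+ * The helper name below is hypothetical, for demonstration only.
+ */
+#if 0
+static void mp_irqflag_demo(unsigned short irqflag)
+{
+       unsigned int polarity = irqflag & 3;        /* 0=conform 1=high 2=rsvd 3=low */
+       unsigned int trigger  = (irqflag >> 2) & 3; /* 0=conform 1=edge 2=rsvd 3=level */
+
+       printk(KERN_DEBUG "polarity=%u trigger=%u\n", polarity, trigger);
+}
+#endif
+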
30199 +static int next_irq = 16;
30200 +
30201 +/*
30202 + * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
30203 + * in the range 0-15, a Linux IRQ in the range 0-223, or a GSI number
30204 + * from ACPI, which can reach 800 in large boxen.
30205 + *
30206 + * Compact the sparse GSI space into a sequential IRQ series and reuse
30207 + * vectors if possible.
30208 + */
30209 +int gsi_irq_sharing(int gsi)
30210 +{
30211 +       int i, tries, vector;
30212 +
30213 +       BUG_ON(gsi >= NR_IRQ_VECTORS);
30214 +
30215 +       if (platform_legacy_irq(gsi))
30216 +               return gsi;
30217 +
30218 +       if (gsi_2_irq[gsi] != 0xFF)
30219 +               return (int)gsi_2_irq[gsi];
30220 +
30221 +       tries = NR_IRQS;
30222 +  try_again:
30223 +       vector = assign_irq_vector(gsi);
30224 +
30225 +       /*
30226 +        * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
30227 +        * use of vector and if found, return that IRQ.  However, we never want
30228 +        * to share legacy IRQs, which usually have a different trigger mode
30229 +        * than PCI.
30230 +        */
30231 +       for (i = 0; i < NR_IRQS; i++)
30232 +               if (IO_APIC_VECTOR(i) == vector)
30233 +                       break;
30234 +       if (platform_legacy_irq(i)) {
30235 +               if (--tries >= 0) {
30236 +                       IO_APIC_VECTOR(i) = 0;
30237 +                       goto try_again;
30238 +               }
30239 +               panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
30240 +       }
30241 +       if (i < NR_IRQS) {
30242 +               gsi_2_irq[gsi] = i;
30243 +               printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
30244 +                               gsi, vector, i);
30245 +               return i;
30246 +       }
30247 +
30248 +       i = next_irq++;
30249 +       BUG_ON(i >= NR_IRQS);
30250 +       gsi_2_irq[gsi] = i;
30251 +       IO_APIC_VECTOR(i) = vector;
30252 +       printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
30253 +                       gsi, vector, i);
30254 +       return i;
30255 +}
30256 +
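+/*
+ * Illustrative sketch (hypothetical helper, example only): gsi_2_irq[]
+ * above acts as a memoised map -- a GSI is resolved at most once and
+ * every later lookup returns the cached Linux IRQ, so the sparse GSI
+ * space collapses into a dense, stable IRQ series.
+ */
+#if 0
+static int gsi_lookup_demo(int gsi)
+{
+       if (gsi_2_irq[gsi] != 0xFF)             /* already compacted? */
+               return (int)gsi_2_irq[gsi];
+       return gsi_irq_sharing(gsi);            /* allocate, cache, return */
+}
+#endif
+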
30257 +static int pin_2_irq(int idx, int apic, int pin)
30258 +{
30259 +       int irq, i;
30260 +       int bus = mp_irqs[idx].mpc_srcbus;
30261 +
30262 +       /*
30263 +        * Debugging check, we are in big trouble if this message pops up!
30264 +        */
30265 +       if (mp_irqs[idx].mpc_dstirq != pin)
30266 +               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
30267 +
30268 +       switch (mp_bus_id_to_type[bus])
30269 +       {
30270 +               case MP_BUS_ISA: /* ISA pin */
30271 +               case MP_BUS_EISA:
30272 +               case MP_BUS_MCA:
30273 +               {
30274 +                       irq = mp_irqs[idx].mpc_srcbusirq;
30275 +                       break;
30276 +               }
30277 +               case MP_BUS_PCI: /* PCI pin */
30278 +               {
30279 +                       /*
30280 +                        * PCI IRQs are mapped in order
30281 +                        */
30282 +                       i = irq = 0;
30283 +                       while (i < apic)
30284 +                               irq += nr_ioapic_registers[i++];
30285 +                       irq += pin;
30286 +                       irq = gsi_irq_sharing(irq);
30287 +                       break;
30288 +               }
30289 +               default:
30290 +               {
30291 +                       printk(KERN_ERR "unknown bus type %d.\n", bus);
30292 +                       irq = 0;
30293 +                       break;
30294 +               }
30295 +       }
30296 +       BUG_ON(irq >= NR_IRQS);
30297 +
30298 +       /*
30299 +        * PCI IRQ command line redirection. Yes, limits are hardcoded.
30300 +        */
30301 +       if ((pin >= 16) && (pin <= 23)) {
30302 +               if (pirq_entries[pin-16] != -1) {
30303 +                       if (!pirq_entries[pin-16]) {
30304 +                               apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
30305 +                       } else {
30306 +                               irq = pirq_entries[pin-16];
30307 +                               apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
30308 +                                               pin-16, irq);
30309 +                       }
30310 +               }
30311 +       }
30312 +       BUG_ON(irq >= NR_IRQS);
30313 +       return irq;
30314 +}
30315 +
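+/*
+ * Illustrative sketch (hypothetical helper, example only): for PCI,
+ * pin_2_irq() above converts an (apic, pin) pair into a global index
+ * by summing the pin counts of all preceding IO-APICs.  With two
+ * 24-pin IO-APICs, apic=1/pin=3 gives 24 + 3 = 27, which is then
+ * compacted by gsi_irq_sharing().
+ */
+#if 0
+static int apic_pin_to_gsi_demo(int apic, int pin)
+{
+       int i, gsi = 0;
+
+       for (i = 0; i < apic; i++)
+               gsi += nr_ioapic_registers[i];
+       return gsi + pin;
+}
+#endif
+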
30316 +static inline int IO_APIC_irq_trigger(int irq)
30317 +{
30318 +       int apic, idx, pin;
30319 +
30320 +       for (apic = 0; apic < nr_ioapics; apic++) {
30321 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
30322 +                       idx = find_irq_entry(apic,pin,mp_INT);
30323 +                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
30324 +                               return irq_trigger(idx);
30325 +               }
30326 +       }
30327 +       /*
30328 +        * nonexistent IRQs default to edge triggering
30329 +        */
30330 +       return 0;
30331 +}
30332 +
30333 +/* irq_vector is sized for the sum of all RTEs in all I/O APICs. */
30334 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
30335 +
30336 +int assign_irq_vector(int irq)
30337 +{
30338 +       static int current_vector = FIRST_DEVICE_VECTOR;
30339 +       physdev_op_t op;
30340 +
30341 +       BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
30342 +       if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
30343 +               return IO_APIC_VECTOR(irq);
30344 +
30345 +       op.cmd = PHYSDEVOP_ASSIGN_VECTOR;
30346 +       op.u.irq_op.irq = irq;
30347 +       if (HYPERVISOR_physdev_op(&op))
30348 +               return -ENOSPC;
30349 +       current_vector = op.u.irq_op.vector;
30350 +
30351 +       vector_irq[current_vector] = irq;
30352 +       if (irq != AUTO_ASSIGN)
30353 +               IO_APIC_VECTOR(irq) = current_vector;
30354 +
30355 +       return current_vector;
30356 +}
30357 +
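+/*
+ * Illustrative sketch (example only): the hypercall pattern used by
+ * assign_irq_vector() above, isolated for clarity.  Under Xen the
+ * vector is chosen by the hypervisor, so callers must be prepared
+ * for failure (-ENOSPC) rather than assuming allocation succeeds.
+ */
+#if 0
+static int xen_vector_demo(int irq)
+{
+       physdev_op_t op;
+
+       op.cmd = PHYSDEVOP_ASSIGN_VECTOR;
+       op.u.irq_op.irq = irq;
+       if (HYPERVISOR_physdev_op(&op))
+               return -ENOSPC;                 /* hypervisor refused */
+       return op.u.irq_op.vector;              /* vector chosen by Xen */
+}
+#endif
+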
30358 +extern void (*interrupt[NR_IRQS])(void);
30359 +#ifndef CONFIG_XEN
30360 +static struct hw_interrupt_type ioapic_level_type;
30361 +static struct hw_interrupt_type ioapic_edge_type;
30362 +
30363 +#define IOAPIC_AUTO    -1
30364 +#define IOAPIC_EDGE    0
30365 +#define IOAPIC_LEVEL   1
30366 +
30367 +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
30368 +{
30369 +       if (use_pci_vector() && !platform_legacy_irq(irq)) {
30370 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
30371 +                               trigger == IOAPIC_LEVEL)
30372 +                       irq_desc[vector].handler = &ioapic_level_type;
30373 +               else
30374 +                       irq_desc[vector].handler = &ioapic_edge_type;
30375 +               set_intr_gate(vector, interrupt[vector]);
30376 +       } else  {
30377 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
30378 +                               trigger == IOAPIC_LEVEL)
30379 +                       irq_desc[irq].handler = &ioapic_level_type;
30380 +               else
30381 +                       irq_desc[irq].handler = &ioapic_edge_type;
30382 +               set_intr_gate(vector, interrupt[irq]);
30383 +       }
30384 +}
30385 +#else
30386 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
30387 +#endif /* !CONFIG_XEN */
30388 +
30389 +static void __init setup_IO_APIC_irqs(void)
30390 +{
30391 +       struct IO_APIC_route_entry entry;
30392 +       int apic, pin, idx, irq, first_notcon = 1, vector;
30393 +       unsigned long flags;
30394 +
30395 +       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
30396 +
30397 +       for (apic = 0; apic < nr_ioapics; apic++) {
30398 +       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
30399 +
30400 +               /*
30401 +                * add it to the IO-APIC irq-routing table:
30402 +                */
30403 +               memset(&entry,0,sizeof(entry));
30404 +
30405 +               entry.delivery_mode = INT_DELIVERY_MODE;
30406 +               entry.dest_mode = INT_DEST_MODE;
30407 +               entry.mask = 0;                         /* enable IRQ */
30408 +               entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
30409 +
30410 +               idx = find_irq_entry(apic,pin,mp_INT);
30411 +               if (idx == -1) {
30412 +                       if (first_notcon) {
30413 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
30414 +                               first_notcon = 0;
30415 +                       } else
30416 +                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
30417 +                       continue;
30418 +               }
30419 +
30420 +               entry.trigger = irq_trigger(idx);
30421 +               entry.polarity = irq_polarity(idx);
30422 +
30423 +               if (irq_trigger(idx)) {
30424 +                       entry.trigger = 1;
30425 +                       entry.mask = 1;
30426 +                       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
30427 +               }
30428 +
30429 +               irq = pin_2_irq(idx, apic, pin);
30430 +               add_pin_to_irq(irq, apic, pin);
30431 +
30432 +               if (/* !apic && */ !IO_APIC_IRQ(irq))
30433 +                       continue;
30434 +
30435 +               if (IO_APIC_IRQ(irq)) {
30436 +                       vector = assign_irq_vector(irq);
30437 +                       entry.vector = vector;
30438 +
30439 +                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
30440 +                       if (!apic && (irq < 16))
30441 +                               disable_8259A_irq(irq);
30442 +               }
30443 +               spin_lock_irqsave(&ioapic_lock, flags);
30444 +               io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
30445 +               io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
30446 +               set_native_irq_info(irq, TARGET_CPUS);
30447 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30448 +       }
30449 +       }
30450 +
30451 +       if (!first_notcon)
30452 +               apic_printk(APIC_VERBOSE," not connected.\n");
30453 +}
30454 +
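+/*
+ * Illustrative sketch (hypothetical helper, example only): each 64-bit
+ * redirection table entry occupies two 32-bit indirect registers,
+ * 0x10+2*pin (low half: vector, delivery mode, trigger, mask) and
+ * 0x11+2*pin (high half: destination).  The code above writes the high
+ * half first so the destination is already in place when the low half,
+ * which carries the mask bit, takes effect.
+ */
+#if 0
+static void write_rte_demo(int apic, int pin, struct IO_APIC_route_entry e)
+{
+       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&e) + 1));
+       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&e) + 0));
+}
+#endif
+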
30455 +#ifndef CONFIG_XEN
30456 +/*
30457 + * Set up the 8259A-master output pin as broadcast to all
30458 + * CPUs.
30459 + */
30460 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
30461 +{
30462 +       struct IO_APIC_route_entry entry;
30463 +       unsigned long flags;
30464 +
30465 +       memset(&entry,0,sizeof(entry));
30466 +
30467 +       disable_8259A_irq(0);
30468 +
30469 +       /* mask LVT0 */
30470 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
30471 +
30472 +       /*
30473 +        * We use logical delivery to get the timer IRQ
30474 +        * to the first CPU.
30475 +        */
30476 +       entry.dest_mode = INT_DEST_MODE;
30477 +       entry.mask = 0;                                 /* unmask IRQ now */
30478 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
30479 +       entry.delivery_mode = INT_DELIVERY_MODE;
30480 +       entry.polarity = 0;
30481 +       entry.trigger = 0;
30482 +       entry.vector = vector;
30483 +
30484 +       /*
30485 +        * The timer IRQ doesn't have to know that behind the
30486 +        * scenes we have an 8259A-master in AEOI mode ...
30487 +        */
30488 +       irq_desc[0].handler = &ioapic_edge_type;
30489 +
30490 +       /*
30491 +        * Add it to the IO-APIC irq-routing table:
30492 +        */
30493 +       spin_lock_irqsave(&ioapic_lock, flags);
30494 +       io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
30495 +       io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
30496 +       spin_unlock_irqrestore(&ioapic_lock, flags);
30497 +
30498 +       enable_8259A_irq(0);
30499 +}
30500 +
30501 +void __init UNEXPECTED_IO_APIC(void)
30502 +{
30503 +}
30504 +
30505 +void __apicdebuginit print_IO_APIC(void)
30506 +{
30507 +       int apic, i;
30508 +       union IO_APIC_reg_00 reg_00;
30509 +       union IO_APIC_reg_01 reg_01;
30510 +       union IO_APIC_reg_02 reg_02;
30511 +       unsigned long flags;
30512 +
30513 +       if (apic_verbosity == APIC_QUIET)
30514 +               return;
30515 +
30516 +       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
30517 +       for (i = 0; i < nr_ioapics; i++)
30518 +               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
30519 +                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
30520 +
30521 +       /*
30522 +        * We are a bit conservative about what we expect.  We have to
30523 +        * know about every hardware change ASAP.
30524 +        */
30525 +       printk(KERN_INFO "testing the IO APIC.......................\n");
30526 +
30527 +       for (apic = 0; apic < nr_ioapics; apic++) {
30528 +
30529 +       spin_lock_irqsave(&ioapic_lock, flags);
30530 +       reg_00.raw = io_apic_read(apic, 0);
30531 +       reg_01.raw = io_apic_read(apic, 1);
30532 +       if (reg_01.bits.version >= 0x10)
30533 +               reg_02.raw = io_apic_read(apic, 2);
30534 +       spin_unlock_irqrestore(&ioapic_lock, flags);
30535 +
30536 +       printk("\n");
30537 +       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
30538 +       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
30539 +       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
30540 +       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
30541 +               UNEXPECTED_IO_APIC();
30542 +
30543 +       printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
30544 +       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
30545 +       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
30546 +               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
30547 +               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
30548 +               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
30549 +               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
30550 +               (reg_01.bits.entries != 0x2E) &&
30551 +               (reg_01.bits.entries != 0x3F) &&
30552 +               (reg_01.bits.entries != 0x03) 
30553 +       )
30554 +               UNEXPECTED_IO_APIC();
30555 +
30556 +       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
30557 +       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
30558 +       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
30559 +               (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
30560 +               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
30561 +               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
30562 +               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
30563 +               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
30564 +       )
30565 +               UNEXPECTED_IO_APIC();
30566 +       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
30567 +               UNEXPECTED_IO_APIC();
30568 +
30569 +       if (reg_01.bits.version >= 0x10) {
30570 +               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
30571 +               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
30572 +               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
30573 +                       UNEXPECTED_IO_APIC();
30574 +       }
30575 +
30576 +       printk(KERN_DEBUG ".... IRQ redirection table:\n");
30577 +
30578 +       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
30579 +                         " Stat Dest Deli Vect:   \n");
30580 +
30581 +       for (i = 0; i <= reg_01.bits.entries; i++) {
30582 +               struct IO_APIC_route_entry entry;
30583 +
30584 +               spin_lock_irqsave(&ioapic_lock, flags);
30585 +               *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
30586 +               *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
30587 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30588 +
30589 +               printk(KERN_DEBUG " %02x %03X %02X  ",
30590 +                       i,
30591 +                       entry.dest.logical.logical_dest,
30592 +                       entry.dest.physical.physical_dest
30593 +               );
30594 +
30595 +               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
30596 +                       entry.mask,
30597 +                       entry.trigger,
30598 +                       entry.irr,
30599 +                       entry.polarity,
30600 +                       entry.delivery_status,
30601 +                       entry.dest_mode,
30602 +                       entry.delivery_mode,
30603 +                       entry.vector
30604 +               );
30605 +       }
30606 +       }
30607 +       if (use_pci_vector())
30608 +               printk(KERN_INFO "Using vector-based indexing\n");
30609 +       printk(KERN_DEBUG "IRQ to pin mappings:\n");
30610 +       for (i = 0; i < NR_IRQS; i++) {
30611 +               struct irq_pin_list *entry = irq_2_pin + i;
30612 +               if (entry->pin < 0)
30613 +                       continue;
30614 +               if (use_pci_vector() && !platform_legacy_irq(i))
30615 +                       printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
30616 +               else
30617 +                       printk(KERN_DEBUG "IRQ%d ", i);
30618 +               for (;;) {
30619 +                       printk("-> %d:%d", entry->apic, entry->pin);
30620 +                       if (!entry->next)
30621 +                               break;
30622 +                       entry = irq_2_pin + entry->next;
30623 +               }
30624 +               printk("\n");
30625 +       }
30626 +
30627 +       printk(KERN_INFO ".................................... done.\n");
30628 +
30629 +       return;
30630 +}
30631 +
30632 +#if 0
30633 +
30634 +static __apicdebuginit void print_APIC_bitfield (int base)
30635 +{
30636 +       unsigned int v;
30637 +       int i, j;
30638 +
30639 +       if (apic_verbosity == APIC_QUIET)
30640 +               return;
30641 +
30642 +       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
30643 +       for (i = 0; i < 8; i++) {
30644 +               v = apic_read(base + i*0x10);
30645 +               for (j = 0; j < 32; j++) {
30646 +                       if (v & (1<<j))
30647 +                               printk("1");
30648 +                       else
30649 +                               printk("0");
30650 +               }
30651 +               printk("\n");
30652 +       }
30653 +}
30654 +
30655 +void __apicdebuginit print_local_APIC(void * dummy)
30656 +{
30657 +       unsigned int v, ver, maxlvt;
30658 +
30659 +       if (apic_verbosity == APIC_QUIET)
30660 +               return;
30661 +
30662 +       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
30663 +               smp_processor_id(), hard_smp_processor_id());
30664 +       v = apic_read(APIC_ID);
30665 +       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
30666 +       v = apic_read(APIC_LVR);
30667 +       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
30668 +       ver = GET_APIC_VERSION(v);
30669 +       maxlvt = get_maxlvt();
30670 +
30671 +       v = apic_read(APIC_TASKPRI);
30672 +       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
30673 +
30674 +       v = apic_read(APIC_ARBPRI);
30675 +       printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
30676 +               v & APIC_ARBPRI_MASK);
30677 +       v = apic_read(APIC_PROCPRI);
30678 +       printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
30679 +
30680 +       v = apic_read(APIC_EOI);
30681 +       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
30682 +       v = apic_read(APIC_RRR);
30683 +       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
30684 +       v = apic_read(APIC_LDR);
30685 +       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
30686 +       v = apic_read(APIC_DFR);
30687 +       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
30688 +       v = apic_read(APIC_SPIV);
30689 +       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
30690 +
30691 +       printk(KERN_DEBUG "... APIC ISR field:\n");
30692 +       print_APIC_bitfield(APIC_ISR);
30693 +       printk(KERN_DEBUG "... APIC TMR field:\n");
30694 +       print_APIC_bitfield(APIC_TMR);
30695 +       printk(KERN_DEBUG "... APIC IRR field:\n");
30696 +       print_APIC_bitfield(APIC_IRR);
30697 +
30698 +       v = apic_read(APIC_ESR);
30699 +       printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
30700 +
30701 +       v = apic_read(APIC_ICR);
30702 +       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
30703 +       v = apic_read(APIC_ICR2);
30704 +       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
30705 +
30706 +       v = apic_read(APIC_LVTT);
30707 +       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
30708 +
30709 +       if (maxlvt > 3) {                       /* PC is LVT#4. */
30710 +               v = apic_read(APIC_LVTPC);
30711 +               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
30712 +       }
30713 +       v = apic_read(APIC_LVT0);
30714 +       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
30715 +       v = apic_read(APIC_LVT1);
30716 +       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
30717 +
30718 +       if (maxlvt > 2) {                       /* ERR is LVT#3. */
30719 +               v = apic_read(APIC_LVTERR);
30720 +               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
30721 +       }
30722 +
30723 +       v = apic_read(APIC_TMICT);
30724 +       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
30725 +       v = apic_read(APIC_TMCCT);
30726 +       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
30727 +       v = apic_read(APIC_TDCR);
30728 +       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
30729 +       printk("\n");
30730 +}
30731 +
30732 +void print_all_local_APICs (void)
30733 +{
30734 +       on_each_cpu(print_local_APIC, NULL, 1, 1);
30735 +}
30736 +
30737 +void __apicdebuginit print_PIC(void)
30738 +{
30739 +       unsigned int v;
30740 +       unsigned long flags;
30741 +
30742 +       if (apic_verbosity == APIC_QUIET)
30743 +               return;
30744 +
30745 +       printk(KERN_DEBUG "\nprinting PIC contents\n");
30746 +
30747 +       spin_lock_irqsave(&i8259A_lock, flags);
30748 +
30749 +       v = inb(0xa1) << 8 | inb(0x21);
30750 +       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
30751 +
30752 +       v = inb(0xa0) << 8 | inb(0x20);
30753 +       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
30754 +
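+       /* OCW3: 0x0b selects the In-Service Register for read-back; 0x0a switches back to the IRR. */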
30755 +       outb(0x0b,0xa0);
30756 +       outb(0x0b,0x20);
30757 +       v = inb(0xa0) << 8 | inb(0x20);
30758 +       outb(0x0a,0xa0);
30759 +       outb(0x0a,0x20);
30760 +
30761 +       spin_unlock_irqrestore(&i8259A_lock, flags);
30762 +
30763 +       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
30764 +
30765 +       v = inb(0x4d1) << 8 | inb(0x4d0);
30766 +       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
30767 +}
30768 +
30769 +#endif  /*  0  */
30770 +
30771 +#else
30772 +void __init print_IO_APIC(void) { }
30773 +#endif /* !CONFIG_XEN */
30774 +
30775 +static void __init enable_IO_APIC(void)
30776 +{
30777 +       union IO_APIC_reg_01 reg_01;
30778 +#ifndef CONFIG_XEN
30779 +       int i8259_apic, i8259_pin;
30780 +#endif
30781 +       int i, apic;
30782 +       unsigned long flags;
30783 +
30784 +       for (i = 0; i < PIN_MAP_SIZE; i++) {
30785 +               irq_2_pin[i].pin = -1;
30786 +               irq_2_pin[i].next = 0;
30787 +       }
30788 +       if (!pirqs_enabled)
30789 +               for (i = 0; i < MAX_PIRQS; i++)
30790 +                       pirq_entries[i] = -1;
30791 +
30792 +       /*
30793 +        * The number of IO-APIC IRQ registers (== #pins):
30794 +        */
30795 +       for (apic = 0; apic < nr_ioapics; apic++) {
30796 +               spin_lock_irqsave(&ioapic_lock, flags);
30797 +               reg_01.raw = io_apic_read(apic, 1);
30798 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30799 +               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
30800 +       }
30801 +#ifndef CONFIG_XEN
30802 +       for(apic = 0; apic < nr_ioapics; apic++) {
30803 +               int pin;
30804 +               /* See if any of the pins is in ExtINT mode */
30805 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
30806 +                       struct IO_APIC_route_entry entry;
30807 +                       spin_lock_irqsave(&ioapic_lock, flags);
30808 +                       *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
30809 +                       *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
30810 +                       spin_unlock_irqrestore(&ioapic_lock, flags);
30811 +
30813 +                       /* If the interrupt line is enabled and in ExtInt mode,
30814 +                        * we have found the pin where the i8259 is connected.
30815 +                        */
30816 +                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
30817 +                               ioapic_i8259.apic = apic;
30818 +                               ioapic_i8259.pin  = pin;
30819 +                               goto found_i8259;
30820 +                       }
30821 +               }
30822 +       }
30823 + found_i8259:
30824 +       /* Look to see if the MP table has reported the ExtINT */
30825 +       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
30826 +       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
30827 +       /* Trust the MP table if nothing is set up in the hardware */
30828 +       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
30829 +               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
30830 +               ioapic_i8259.pin  = i8259_pin;
30831 +               ioapic_i8259.apic = i8259_apic;
30832 +       }
30833 +       /* Complain if the MP table and the hardware disagree */
30834 +       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
30835 +               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
30836 +       {
30837 +               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
30838 +       }
30839 +#endif
30840 +
30841 +       /*
30842 +        * Do not trust the IO-APIC being empty at bootup
30843 +        */
30844 +       clear_IO_APIC();
30845 +}
30846 +
30847 +/*
30848 + * Not an __init, needed by the reboot code
30849 + */
30850 +void disable_IO_APIC(void)
30851 +{
30852 +       /*
30853 +        * Clear the IO-APIC before rebooting:
30854 +        */
30855 +       clear_IO_APIC();
30856 +
30857 +#ifndef CONFIG_XEN
30858 +       /*
30859 +        * If the i8259 is routed through an IOAPIC
30860 +        * Put that IOAPIC in virtual wire mode
30861 +        * so legacy interrupts can be delivered.
30862 +        */
30863 +       if (ioapic_i8259.pin != -1) {
30864 +               struct IO_APIC_route_entry entry;
30865 +               unsigned long flags;
30866 +
30867 +               memset(&entry, 0, sizeof(entry));
30868 +               entry.mask            = 0; /* Enabled */
30869 +               entry.trigger         = 0; /* Edge */
30870 +               entry.irr             = 0;
30871 +               entry.polarity        = 0; /* High */
30872 +               entry.delivery_status = 0;
30873 +               entry.dest_mode       = 0; /* Physical */
30874 +               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
30875 +               entry.vector          = 0;
30876 +               entry.dest.physical.physical_dest =
30877 +                                       GET_APIC_ID(apic_read(APIC_ID));
30878 +
30879 +               /*
30880 +                * Add it to the IO-APIC irq-routing table:
30881 +                */
30882 +               spin_lock_irqsave(&ioapic_lock, flags);
30883 +               io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
30884 +                       *(((int *)&entry)+1));
30885 +               io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
30886 +                       *(((int *)&entry)+0));
30887 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30888 +       }
30889 +
30890 +       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
30891 +#endif
30892 +}
30893 +
30894 +/*
30895 + * Set the IO-APIC physical IDs based on the
30896 + * values stored in the MPC table.
30897 + *
30898 + * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
30899 + */
30900 +
30901 +#ifndef CONFIG_XEN
30902 +static void __init setup_ioapic_ids_from_mpc (void)
30903 +{
30904 +       union IO_APIC_reg_00 reg_00;
30905 +       int apic;
30906 +       int i;
30907 +       unsigned char old_id;
30908 +       unsigned long flags;
30909 +
30910 +       /*
30911 +        * Set the IOAPIC ID to the value stored in the MPC table.
30912 +        */
30913 +       for (apic = 0; apic < nr_ioapics; apic++) {
30914 +
30915 +               /* Read the register 0 value */
30916 +               spin_lock_irqsave(&ioapic_lock, flags);
30917 +               reg_00.raw = io_apic_read(apic, 0);
30918 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30919 +
30920 +               old_id = mp_ioapics[apic].mpc_apicid;
30921 +
30922 +               printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
30923 +
30926 +               /*
30927 +                * We need to adjust the IRQ routing table
30928 +                * if the ID changed.
30929 +                */
30930 +               if (old_id != mp_ioapics[apic].mpc_apicid)
30931 +                       for (i = 0; i < mp_irq_entries; i++)
30932 +                               if (mp_irqs[i].mpc_dstapic == old_id)
30933 +                                       mp_irqs[i].mpc_dstapic
30934 +                                               = mp_ioapics[apic].mpc_apicid;
30935 +
30936 +               /*
30937 +                * Read the right value from the MPC table and
30938 +                * write it into the ID register.
30939 +                */
30940 +               apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
30941 +                               mp_ioapics[apic].mpc_apicid);
30942 +
30943 +               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
30944 +               spin_lock_irqsave(&ioapic_lock, flags);
30945 +               io_apic_write(apic, 0, reg_00.raw);
30946 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30947 +
30948 +               /*
30949 +                * Sanity check
30950 +                */
30951 +               spin_lock_irqsave(&ioapic_lock, flags);
30952 +               reg_00.raw = io_apic_read(apic, 0);
30953 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30954 +               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
30955 +                       printk("could not set ID!\n");
30956 +               else
30957 +                       apic_printk(APIC_VERBOSE," ok.\n");
30958 +       }
30959 +}
30960 +#else
30961 +static void __init setup_ioapic_ids_from_mpc(void) { }
30962 +#endif
30963 +
30964 +/*
30965 + * There is a nasty bug in some older SMP boards: their mptable lies
30966 + * about the timer IRQ. We do the following to work around the situation:
30967 + *
30968 + *     - timer IRQ defaults to IO-APIC IRQ
30969 + *     - if this function detects that timer IRQs are defunct, then we fall
30970 + *       back to ISA timer IRQs
30971 + */
30972 +#ifndef CONFIG_XEN
30973 +static int __init timer_irq_works(void)
30974 +{
30975 +       unsigned long t1 = jiffies;
30976 +
30977 +       local_irq_enable();
30978 +       /* Let ten ticks pass... */
30979 +       mdelay((10 * 1000) / HZ);
30980 +
30981 +       /*
30982 +        * Expect a few ticks at least, to be sure some possible
30983 +        * glue logic does not lock up after the first one or two
30984 +        * ticks in a non-ExtINT mode.  Also the local APIC
30985 +        * might have cached one ExtINT interrupt.  Finally, at
30986 +        * least one tick may be lost due to delays.
30987 +        */
30988 +
30989 +       /* jiffies wrap? */
30990 +       if (jiffies - t1 > 4)
30991 +               return 1;
30992 +       return 0;
30993 +}
30994 +
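+/*
+ * Illustrative sketch (hypothetical helper, example only):
+ * mdelay((10 * 1000) / HZ) busy-waits for the wall-clock span of ten
+ * timer ticks with interrupts enabled; timer_irq_works() then demands
+ * more than four elapsed jiffies, leaving slack for a cached ExtINT
+ * and a lost tick or two.
+ */
+#if 0
+static int ticks_elapsed_demo(void)
+{
+       unsigned long t1 = jiffies;
+
+       mdelay((10 * 1000) / HZ);       /* ~10 tick periods */
+       return jiffies - t1;            /* expect ~10, require > 4 */
+}
+#endif
+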
30995 +/*
30996 + * In the SMP+IOAPIC case it might happen that there are an unspecified
30997 + * number of pending IRQ events unhandled. These cases are very rare,
30998 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
30999 + * better to do it this way as thus we do not have to be aware of
31000 + * 'pending' interrupts in the IRQ path, except at this point.
31001 + */
31002 +/*
31003 + * Edge triggered needs to resend any interrupt
31004 + * that was delayed but this is now handled in the device
31005 + * independent code.
31006 + */
31007 +
31008 +/*
31009 + * Starting up an edge-triggered IO-APIC interrupt is
31010 + * nasty - we need to make sure that we get the edge.
31011 + * If it is already asserted for some reason, we need to
31012 + * return 1 to indicate that it was pending.
31013 + *
31014 + * This is not complete - we should be able to fake
31015 + * an edge even if it isn't on the 8259A...
31016 + */
31017 +
31018 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
31019 +{
31020 +       int was_pending = 0;
31021 +       unsigned long flags;
31022 +
31023 +       spin_lock_irqsave(&ioapic_lock, flags);
31024 +       if (irq < 16) {
31025 +               disable_8259A_irq(irq);
31026 +               if (i8259A_irq_pending(irq))
31027 +                       was_pending = 1;
31028 +       }
31029 +       __unmask_IO_APIC_irq(irq);
31030 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31031 +
31032 +       return was_pending;
31033 +}
31034 +
31035 +/*
31036 + * Once we have recorded IRQ_PENDING already, we can mask the
31037 + * interrupt for real. This prevents IRQ storms from unhandled
31038 + * devices.
31039 + */
31040 +static void ack_edge_ioapic_irq(unsigned int irq)
31041 +{
31042 +       move_irq(irq);
31043 +       if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
31044 +                                       == (IRQ_PENDING | IRQ_DISABLED))
31045 +               mask_IO_APIC_irq(irq);
31046 +       ack_APIC_irq();
31047 +}
31048 +
31049 +/*
31050 + * Level triggered interrupts can just be masked,
31051 + * and shutting down and starting up the interrupt
31052 + * is the same as enabling and disabling them -- except
31053 + * that startup needs to return a "was pending" value.
31054 + *
31055 + * Level triggered interrupts are special because we
31056 + * do not touch any IO-APIC register while handling
31057 + * them. We ack the APIC in the end-IRQ handler, not
31058 + * in the start-IRQ-handler. Protection against reentrance
31059 + * from the same interrupt is still provided, both by the
31060 + * generic IRQ layer and by the fact that an unacked local
31061 + * APIC does not accept IRQs.
31062 + */
31063 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
31064 +{
31065 +       unmask_IO_APIC_irq(irq);
31066 +
31067 +       return 0; /* don't check for pending */
31068 +}
31069 +
31070 +static void end_level_ioapic_irq (unsigned int irq)
31071 +{
31072 +       move_irq(irq);
31073 +       ack_APIC_irq();
31074 +}
31075 +
31076 +#ifdef CONFIG_PCI_MSI
31077 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
31078 +{
31079 +       int irq = vector_to_irq(vector);
31080 +
31081 +       return startup_edge_ioapic_irq(irq);
31082 +}
31083 +
31084 +static void ack_edge_ioapic_vector(unsigned int vector)
31085 +{
31086 +       int irq = vector_to_irq(vector);
31087 +
31088 +       move_native_irq(vector);
31089 +       ack_edge_ioapic_irq(irq);
31090 +}
31091 +
31092 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
31093 +{
31094 +       int irq = vector_to_irq(vector);
31095 +
31096 +       return startup_level_ioapic_irq (irq);
31097 +}
31098 +
31099 +static void end_level_ioapic_vector (unsigned int vector)
31100 +{
31101 +       int irq = vector_to_irq(vector);
31102 +
31103 +       move_native_irq(vector);
31104 +       end_level_ioapic_irq(irq);
31105 +}
31106 +
31107 +static void mask_IO_APIC_vector (unsigned int vector)
31108 +{
31109 +       int irq = vector_to_irq(vector);
31110 +
31111 +       mask_IO_APIC_irq(irq);
31112 +}
31113 +
31114 +static void unmask_IO_APIC_vector (unsigned int vector)
31115 +{
31116 +       int irq = vector_to_irq(vector);
31117 +
31118 +       unmask_IO_APIC_irq(irq);
31119 +}
31120 +
31121 +#ifdef CONFIG_SMP
31122 +static void set_ioapic_affinity_vector (unsigned int vector,
31123 +                                       cpumask_t cpu_mask)
31124 +{
31125 +       int irq = vector_to_irq(vector);
31126 +
31127 +       set_native_irq_info(vector, cpu_mask);
31128 +       set_ioapic_affinity_irq(irq, cpu_mask);
31129 +}
31130 +#endif /* CONFIG_SMP */
31131 +#endif /* CONFIG_PCI_MSI */
31132 +
31133 +/*
31134 + * Level and edge triggered IO-APIC interrupts need different handling,
31135 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
31136 + * handled with the level-triggered descriptor, but that one has slightly
31137 + * more overhead. Level-triggered interrupts cannot be handled with the
31138 + * edge-triggered handler, without risking IRQ storms and other ugly
31139 + * races.
31140 + */
31141 +
31142 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
31143 +       .typename = "IO-APIC-edge",
31144 +       .startup        = startup_edge_ioapic,
31145 +       .shutdown       = shutdown_edge_ioapic,
31146 +       .enable         = enable_edge_ioapic,
31147 +       .disable        = disable_edge_ioapic,
31148 +       .ack            = ack_edge_ioapic,
31149 +       .end            = end_edge_ioapic,
31150 +#ifdef CONFIG_SMP
31151 +       .set_affinity = set_ioapic_affinity,
31152 +#endif
31153 +};
31154 +
31155 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
31156 +       .typename = "IO-APIC-level",
31157 +       .startup        = startup_level_ioapic,
31158 +       .shutdown       = shutdown_level_ioapic,
31159 +       .enable         = enable_level_ioapic,
31160 +       .disable        = disable_level_ioapic,
31161 +       .ack            = mask_and_ack_level_ioapic,
31162 +       .end            = end_level_ioapic,
31163 +#ifdef CONFIG_SMP
31164 +       .set_affinity = set_ioapic_affinity,
31165 +#endif
31166 +};
31167 +#endif /* !CONFIG_XEN */
31168 +
31169 +static inline void init_IO_APIC_traps(void)
31170 +{
31171 +       int irq;
31172 +
31173 +       /*
31174 +        * NOTE! The local APIC isn't very good at handling
31175 +        * multiple interrupts at the same interrupt level.
31176 +        * As the interrupt level is determined by taking the
31177 +        * vector number and shifting that right by 4, we
31178 +        * want to spread these out a bit so that they don't
31179 +        * all fall in the same interrupt level.
31180 +        *
31181 +        * Also, we've got to be careful not to trash gate
31182 +        * 0x80, because int 0x80 is hm, kind of importantish. ;)
31183 +        */
31184 +       for (irq = 0; irq < NR_IRQS ; irq++) {
31185 +               int tmp = irq;
31186 +               if (use_pci_vector()) {
31187 +                       if (!platform_legacy_irq(tmp))
31188 +                               if ((tmp = vector_to_irq(tmp)) == -1)
31189 +                                       continue;
31190 +               }
31191 +               if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
31192 +                       /*
31193 +                        * Hmm.. We don't have an entry for this,
31194 +                        * so default to an old-fashioned 8259
31195 +                        * interrupt if we can..
31196 +                        */
31197 +                       if (irq < 16)
31198 +                               make_8259A_irq(irq);
31199 +#ifndef CONFIG_XEN
31200 +                       else
31201 +                               /* Strange. Oh, well.. */
31202 +                               irq_desc[irq].handler = &no_irq_type;
31203 +#endif
31204 +               }
31205 +       }
31206 +}
31207 +
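+/*
+ * Illustrative sketch (hypothetical helper, example only): the local
+ * APIC derives an interrupt's priority level from its vector number
+ * as vector >> 4, which is why the comment above wants vectors spread
+ * across the 16 levels rather than clustered in one.
+ */
+#if 0
+static unsigned int vector_level_demo(unsigned int vector)
+{
+       return vector >> 4;     /* 16 levels, 16 vectors per level */
+}
+#endif
+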
31208 +#ifndef CONFIG_XEN
31209 +static void enable_lapic_irq (unsigned int irq)
31210 +{
31211 +       unsigned long v;
31212 +
31213 +       v = apic_read(APIC_LVT0);
31214 +       apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
31215 +}
31216 +
31217 +static void disable_lapic_irq (unsigned int irq)
31218 +{
31219 +       unsigned long v;
31220 +
31221 +       v = apic_read(APIC_LVT0);
31222 +       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
31223 +}
31224 +
31225 +static void ack_lapic_irq (unsigned int irq)
31226 +{
31227 +       ack_APIC_irq();
31228 +}
31229 +
31230 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
31231 +
31232 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
31233 +       .typename = "local-APIC-edge",
31234 +       .startup = NULL, /* startup_irq() not used for IRQ0 */
31235 +       .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
31236 +       .enable = enable_lapic_irq,
31237 +       .disable = disable_lapic_irq,
31238 +       .ack = ack_lapic_irq,
31239 +       .end = end_lapic_irq,
31240 +};
31241 +
31242 +static void setup_nmi (void)
31243 +{
31244 +       /*
31245 +        * Dirty trick to enable the NMI watchdog ...
31246 +        * We put the 8259A master into AEOI mode and
31247 +        * unmask on all local APICs LVT0 as NMI.
31248 +        *
31249 +        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
31250 +        * is from Maciej W. Rozycki - so we do not have to EOI from
31251 +        * the NMI handler or the timer interrupt.
31252 +        */ 
31253 +       printk(KERN_INFO "activating NMI Watchdog ...");
31254 +
31255 +       enable_NMI_through_LVT0(NULL);
31256 +
31257 +       printk(" done.\n");
31258 +}
31259 +
31260 +/*
31261 + * This looks a bit hackish but it's about the only way of sending
31262 + * a few INTA cycles to 8259As and any associated glue logic.  ICR does
31263 + * not support the ExtINT mode, unfortunately.  We need to send these
31264 + * cycles as some i82489DX-based boards have glue logic that keeps the
31265 + * 8259A interrupt line asserted until INTA.  --macro
31266 + */
31267 +static inline void unlock_ExtINT_logic(void)
31268 +{
31269 +       int apic, pin, i;
31270 +       struct IO_APIC_route_entry entry0, entry1;
31271 +       unsigned char save_control, save_freq_select;
31272 +       unsigned long flags;
31273 +
31274 +       pin  = find_isa_irq_pin(8, mp_INT);
31275 +       apic = find_isa_irq_apic(8, mp_INT);
31276 +       if (pin == -1)
31277 +               return;
31278 +
31279 +       spin_lock_irqsave(&ioapic_lock, flags);
31280 +       *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
31281 +       *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
31282 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31283 +       clear_IO_APIC_pin(apic, pin);
31284 +
31285 +       memset(&entry1, 0, sizeof(entry1));
31286 +
31287 +       entry1.dest_mode = 0;                   /* physical delivery */
31288 +       entry1.mask = 0;                        /* unmask IRQ now */
31289 +       entry1.dest.physical.physical_dest = hard_smp_processor_id();
31290 +       entry1.delivery_mode = dest_ExtINT;
31291 +       entry1.polarity = entry0.polarity;
31292 +       entry1.trigger = 0;
31293 +       entry1.vector = 0;
31294 +
31295 +       spin_lock_irqsave(&ioapic_lock, flags);
31296 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
31297 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
31298 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31299 +
31300 +       save_control = CMOS_READ(RTC_CONTROL);
31301 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
31302 +       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
31303 +                  RTC_FREQ_SELECT);
31304 +       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
31305 +
31306 +       i = 100;
31307 +       while (i-- > 0) {
31308 +               mdelay(10);
31309 +               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
31310 +                       i -= 10;
31311 +       }
31312 +
31313 +       CMOS_WRITE(save_control, RTC_CONTROL);
31314 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
31315 +       clear_IO_APIC_pin(apic, pin);
31316 +
31317 +       spin_lock_irqsave(&ioapic_lock, flags);
31318 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
31319 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
31320 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31321 +}
31322 +
31323 +/*
31324 + * This code may look a bit paranoid, but it's supposed to cooperate with
31325 + * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
31326 + * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
31327 + * fanatically on his truly buggy board.
31328 + *
31329 + * FIXME: really need to revamp this for modern platforms only.
31330 + */
31331 +static inline void check_timer(void)
31332 +{
31333 +       int apic1, pin1, apic2, pin2;
31334 +       int vector;
31335 +
31336 +       /*
31337 +        * get/set the timer IRQ vector:
31338 +        */
31339 +       disable_8259A_irq(0);
31340 +       vector = assign_irq_vector(0);
31341 +       set_intr_gate(vector, interrupt[0]);
31342 +
31343 +       /*
31344 +        * Subtle: code in do_timer_interrupt() expects an AEOI
31345 +        * mode for the 8259A whenever interrupts are routed
31346 +        * through I/O APICs.  Also IRQ0 has to be enabled in
31347 +        * the 8259A which implies the virtual wire has to be
31348 +        * disabled in the local APIC.
31349 +        */
31350 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
31351 +       init_8259A(1);
31352 +       if (timer_over_8254 > 0)
31353 +               enable_8259A_irq(0);
31354 +
31355 +       pin1  = find_isa_irq_pin(0, mp_INT);
31356 +       apic1 = find_isa_irq_apic(0, mp_INT);
31357 +       pin2  = ioapic_i8259.pin;
31358 +       apic2 = ioapic_i8259.apic;
31359 +
31360 +       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
31361 +               vector, apic1, pin1, apic2, pin2);
31362 +
31363 +       if (pin1 != -1) {
31364 +               /*
31365 +                * Ok, does IRQ0 through the IOAPIC work?
31366 +                */
31367 +               unmask_IO_APIC_irq(0);
31368 +               if (!no_timer_check && timer_irq_works()) {
31369 +                       nmi_watchdog_default();
31370 +                       if (nmi_watchdog == NMI_IO_APIC) {
31371 +                               disable_8259A_irq(0);
31372 +                               setup_nmi();
31373 +                               enable_8259A_irq(0);
31374 +                       }
31375 +                       if (disable_timer_pin_1 > 0)
31376 +                               clear_IO_APIC_pin(0, pin1);
31377 +                       return;
31378 +               }
31379 +               clear_IO_APIC_pin(apic1, pin1);
31380 +               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
31381 +                               "connected to IO-APIC\n");
31382 +       }
31383 +
31384 +       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
31385 +                               "through the 8259A ... ");
31386 +       if (pin2 != -1) {
31387 +               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
31388 +                       apic2, pin2);
31389 +               /*
31390 +                * legacy devices should be connected to IO APIC #0
31391 +                */
31392 +               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
31393 +               if (timer_irq_works()) {
31394 +                       printk("works.\n");
31395 +                       nmi_watchdog_default();
31396 +                       if (nmi_watchdog == NMI_IO_APIC) {
31397 +                               setup_nmi();
31398 +                       }
31399 +                       return;
31400 +               }
31401 +               /*
31402 +                * Cleanup, just in case ...
31403 +                */
31404 +               clear_IO_APIC_pin(apic2, pin2);
31405 +       }
31406 +       printk(" failed.\n");
31407 +
31408 +       if (nmi_watchdog == NMI_IO_APIC) {
31409 +               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
31410 +               nmi_watchdog = 0;
31411 +       }
31412 +
31413 +       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
31414 +
31415 +       disable_8259A_irq(0);
31416 +       irq_desc[0].handler = &lapic_irq_type;
31417 +       apic_write(APIC_LVT0, APIC_DM_FIXED | vector);  /* Fixed mode */
31418 +       enable_8259A_irq(0);
31419 +
31420 +       if (timer_irq_works()) {
31421 +               apic_printk(APIC_QUIET, " works.\n");
31422 +               return;
31423 +       }
31424 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
31425 +       apic_printk(APIC_VERBOSE," failed.\n");
31426 +
31427 +       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
31428 +
31429 +       init_8259A(0);
31430 +       make_8259A_irq(0);
31431 +       apic_write(APIC_LVT0, APIC_DM_EXTINT);
31432 +
31433 +       unlock_ExtINT_logic();
31434 +
31435 +       if (timer_irq_works()) {
31436 +               apic_printk(APIC_VERBOSE," works.\n");
31437 +               return;
31438 +       }
31439 +       apic_printk(APIC_VERBOSE," failed :(.\n");
31440 +       panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
31441 +}
31442 +#else
31443 +#define check_timer() ((void)0)
31444 +#endif /* !CONFIG_XEN */
31445 +
31446 +static int __init notimercheck(char *s)
31447 +{
31448 +       no_timer_check = 1;
31449 +       return 1;
31450 +}
31451 +__setup("no_timer_check", notimercheck);
31452 +
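+/*
+ * Usage note (illustrative): booting with "no_timer_check" on the
+ * kernel command line sets no_timer_check = 1, which makes
+ * check_timer() skip the timer_irq_works() probe above.
+ */
+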
31453 +/*
31454 + *
31455 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
31456 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
31457 + *   Linux doesn't really care, as it's not actually used
31458 + *   for any interrupt handling anyway.
31459 + */
31460 +#define PIC_IRQS       (1<<2)
31461 +
31462 +void __init setup_IO_APIC(void)
31463 +{
31464 +       enable_IO_APIC();
31465 +
31466 +       if (acpi_ioapic)
31467 +               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
31468 +       else
31469 +               io_apic_irqs = ~PIC_IRQS;
31470 +
31471 +       apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
31472 +
31473 +       /*
31474 +        * Set up the IO-APIC IRQ routing table.
31475 +        */
31476 +       if (!acpi_ioapic)
31477 +               setup_ioapic_ids_from_mpc();
31478 +#ifndef CONFIG_XEN
31479 +       sync_Arb_IDs();
31480 +#endif /* !CONFIG_XEN */
31481 +       setup_IO_APIC_irqs();
31482 +       init_IO_APIC_traps();
31483 +       check_timer();
31484 +       if (!acpi_ioapic)
31485 +               print_IO_APIC();
31486 +}
31487 +
31488 +struct sysfs_ioapic_data {
31489 +       struct sys_device dev;
31490 +       struct IO_APIC_route_entry entry[0];
31491 +};
31492 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
31493 +
31494 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
31495 +{
31496 +       struct IO_APIC_route_entry *entry;
31497 +       struct sysfs_ioapic_data *data;
31498 +       unsigned long flags;
31499 +       int i;
31500 +
31501 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
31502 +       entry = data->entry;
31503 +       spin_lock_irqsave(&ioapic_lock, flags);
31504 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i++, entry++) {
31505 +               *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
31506 +               *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
31507 +       }
31508 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31509 +
31510 +       return 0;
31511 +}
31512 +
31513 +static int ioapic_resume(struct sys_device *dev)
31514 +{
31515 +       struct IO_APIC_route_entry *entry;
31516 +       struct sysfs_ioapic_data *data;
31517 +       unsigned long flags;
31518 +       union IO_APIC_reg_00 reg_00;
31519 +       int i;
31520 +
31521 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
31522 +       entry = data->entry;
31523 +
31524 +       spin_lock_irqsave(&ioapic_lock, flags);
31525 +       reg_00.raw = io_apic_read(dev->id, 0);
31526 +       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
31527 +               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
31528 +               io_apic_write(dev->id, 0, reg_00.raw);
31529 +       }
31530 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i++, entry++) {
31531 +               io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
31532 +               io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
31533 +       }
31534 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31535 +
31536 +       return 0;
31537 +}
31538 +
31539 +static struct sysdev_class ioapic_sysdev_class = {
31540 +       set_kset_name("ioapic"),
31541 +       .suspend = ioapic_suspend,
31542 +       .resume = ioapic_resume,
31543 +};
31544 +
31545 +static int __init ioapic_init_sysfs(void)
31546 +{
31547 +       struct sys_device * dev;
31548 +       int i, size, error = 0;
31549 +
31550 +       error = sysdev_class_register(&ioapic_sysdev_class);
31551 +       if (error)
31552 +               return error;
31553 +
31554 +       for (i = 0; i < nr_ioapics; i++ ) {
31555 +               size = sizeof(struct sys_device) + nr_ioapic_registers[i]
31556 +                       * sizeof(struct IO_APIC_route_entry);
31557 +               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
31558 +               if (!mp_ioapic_data[i]) {
31559 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
31560 +                       continue;
31561 +               }
31562 +               memset(mp_ioapic_data[i], 0, size);
31563 +               dev = &mp_ioapic_data[i]->dev;
31564 +               dev->id = i;
31565 +               dev->cls = &ioapic_sysdev_class;
31566 +               error = sysdev_register(dev);
31567 +               if (error) {
31568 +                       kfree(mp_ioapic_data[i]);
31569 +                       mp_ioapic_data[i] = NULL;
31570 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
31571 +                       continue;
31572 +               }
31573 +       }
31574 +
31575 +       return 0;
31576 +}
31577 +
31578 +device_initcall(ioapic_init_sysfs);
31579 +
31580 +/* --------------------------------------------------------------------------
31581 +                          ACPI-based IOAPIC Configuration
31582 +   -------------------------------------------------------------------------- */
31583 +
31584 +#ifdef CONFIG_ACPI
31585 +
31586 +#define IO_APIC_MAX_ID         0xFE
31587 +
31588 +int __init io_apic_get_version (int ioapic)
31589 +{
31590 +       union IO_APIC_reg_01    reg_01;
31591 +       unsigned long flags;
31592 +
31593 +       spin_lock_irqsave(&ioapic_lock, flags);
31594 +       reg_01.raw = io_apic_read(ioapic, 1);
31595 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31596 +
31597 +       return reg_01.bits.version;
31598 +}
31599 +
31600 +
31601 +int __init io_apic_get_redir_entries (int ioapic)
31602 +{
31603 +       union IO_APIC_reg_01    reg_01;
31604 +       unsigned long flags;
31605 +
31606 +       spin_lock_irqsave(&ioapic_lock, flags);
31607 +       reg_01.raw = io_apic_read(ioapic, 1);
31608 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31609 +
31610 +       return reg_01.bits.entries;
31611 +}
31612 +
31613 +
31614 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
31615 +{
31616 +       struct IO_APIC_route_entry entry;
31617 +       unsigned long flags;
31618 +
31619 +       if (!IO_APIC_IRQ(irq)) {
31620 +               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
31621 +                       ioapic);
31622 +               return -EINVAL;
31623 +       }
31624 +
31625 +       /*
31626 +        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
31627 +        * Note that we mask (disable) IRQs now -- these get enabled when the
31628 +        * corresponding device driver registers for this IRQ.
31629 +        */
31630 +
31631 +       memset(&entry,0,sizeof(entry));
31632 +
31633 +       entry.delivery_mode = INT_DELIVERY_MODE;
31634 +       entry.dest_mode = INT_DEST_MODE;
31635 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
31636 +       entry.trigger = edge_level;
31637 +       entry.polarity = active_high_low;
31638 +       entry.mask = 1;                                  /* Disabled (masked) */
31639 +
31640 +       irq = gsi_irq_sharing(irq);
31641 +       /*
31642 +        * IRQs < 16 are already in the irq_2_pin[] map
31643 +        */
31644 +       if (irq >= 16)
31645 +               add_pin_to_irq(irq, ioapic, pin);
31646 +
31647 +       entry.vector = assign_irq_vector(irq);
31648 +
31649 +       apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
31650 +               "IRQ %d Mode:%i Active:%i)\n", ioapic, 
31651 +              mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
31652 +              edge_level, active_high_low);
31653 +
31654 +       ioapic_register_intr(irq, entry.vector, edge_level);
31655 +
31656 +       if (!ioapic && (irq < 16))
31657 +               disable_8259A_irq(irq);
31658 +
31659 +       spin_lock_irqsave(&ioapic_lock, flags);
31660 +       io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
31661 +       io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
31662 +       set_native_irq_info(use_pci_vector() ?  entry.vector : irq, TARGET_CPUS);
31663 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31664 +
31665 +       return 0;
31666 +}
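+
+/*
+ * Hypothetical caller sketch (not part of this patch): a level-triggered,
+ * active-low PCI interrupt taken from the MADT would be routed with
+ *
+ *     io_apic_set_pci_routing(ioapic, pin, gsi, 1, 1);
+ *
+ * since trigger == 1 selects level and polarity == 1 selects active-low
+ * in the redirection entry built above.
+ */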
31667 +
31668 +#endif /* CONFIG_ACPI */
31669 +
31670 +
31671 +#ifndef CONFIG_XEN
31672 +/*
31673 + * This function currently is only a helper for the i386 smp boot process where
31674 + * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
31675 + * so the mask in all cases should simply be TARGET_CPUS.
31676 + */
31677 +#ifdef CONFIG_SMP
31678 +void __init setup_ioapic_dest(void)
31679 +{
31680 +       int pin, ioapic, irq, irq_entry;
31681 +
31682 +       if (skip_ioapic_setup == 1)
31683 +               return;
31684 +
31685 +       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
31686 +               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
31687 +                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
31688 +                       if (irq_entry == -1)
31689 +                               continue;
31690 +                       irq = pin_2_irq(irq_entry, ioapic, pin);
31691 +                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
31692 +               }
31693 +
31694 +       }
31695 +}
31696 +#endif
31697 +#endif /* !CONFIG_XEN */
31698 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/ioport-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/ioport-xen.c
31699 --- ref-linux-2.6.16.9/arch/x86_64/kernel/ioport-xen.c  1970-01-01 01:00:00.000000000 +0100
31700 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/ioport-xen.c     2006-04-10 00:05:52.000000000 +0200
31701 @@ -0,0 +1,58 @@
31702 +/*
31703 + *     linux/arch/x86_64/kernel/ioport.c
31704 + *
31705 + * This contains the io-permission bitmap code - written by obz, with changes
31706 + * by Linus.
31707 + */
31708 +
31709 +#include <linux/sched.h>
31710 +#include <linux/kernel.h>
31711 +#include <linux/capability.h>
31712 +#include <linux/errno.h>
31713 +#include <linux/types.h>
31714 +#include <linux/ioport.h>
31715 +#include <linux/mm.h>
31716 +#include <linux/smp.h>
31717 +#include <linux/smp_lock.h>
31718 +#include <linux/stddef.h>
31719 +#include <linux/slab.h>
31720 +#include <linux/thread_info.h>
31721 +#include <xen/interface/physdev.h>
31722 +
31723 +/*
31724 + * sys_iopl has to be used when you want to access the IO ports
31725 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
31726 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
31727 + *
31728 + */
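+/* (65536 ports at one bit each is 65536/8 = 8192 bytes, the 8kB above.) */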
31729 +
31730 +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
31731 +{
31732 +       unsigned int old_iopl = current->thread.iopl;
31733 +       physdev_op_t op;
31734 +
31735 +       if (new_iopl > 3)
31736 +               return -EINVAL;
31737 +
31738 +       /* Need "raw I/O" privileges for direct port access. */
31739 +       if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
31740 +               return -EPERM;
31741 +
31742 +       /* Change our version of the privilege levels. */
31743 +       current->thread.iopl = new_iopl;
31744 +
31745 +       /* Force the change at ring 0. */
31746 +       op.cmd             = PHYSDEVOP_SET_IOPL;
31747 +       op.u.set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
31748 +       HYPERVISOR_physdev_op(&op);
31749 +
31750 +       return 0;
31751 +}
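+
+/*
+ * Note that the hypercall above never passes an IOPL of 0.  A plausible
+ * reading, though the patch does not state it, is that the guest kernel
+ * itself runs outside ring 0 under Xen, so 1 acts as the floor.
+ */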
31752 +
31753 +/*
31754 + * Xen: rather than editing a per-task I/O permission bitmap, turning
31755 + * ports on is approximated by raising IOPL to 3. */
31756 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
31757 +{
31758 +       return turn_on ? sys_iopl(3, NULL) : 0;
31759 +}
31760 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/irq-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/irq-xen.c
31761 --- ref-linux-2.6.16.9/arch/x86_64/kernel/irq-xen.c     1970-01-01 01:00:00.000000000 +0100
31762 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/irq-xen.c        2006-04-10 00:05:52.000000000 +0200
31763 @@ -0,0 +1,168 @@
31764 +/*
31765 + *     linux/arch/x86_64/kernel/irq.c
31766 + *
31767 + *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
31768 + *
31769 + * This file contains the lowest level x86_64-specific interrupt
31770 + * entry and irq statistics code. All the remaining irq logic is
31771 + * done by the generic kernel/irq/ code and in the
31772 + * x86_64-specific irq controller code. (e.g. i8259.c and
31773 + * io_apic.c.)
31774 + */
31775 +
31776 +#include <linux/kernel_stat.h>
31777 +#include <linux/interrupt.h>
31778 +#include <linux/seq_file.h>
31779 +#include <linux/module.h>
31780 +#include <linux/delay.h>
31781 +#include <asm/uaccess.h>
31782 +#include <asm/io_apic.h>
31783 +#include <asm/idle.h>
31784 +
31785 +atomic_t irq_err_count;
31786 +#ifdef CONFIG_X86_IO_APIC
31787 +#ifdef APIC_MISMATCH_DEBUG
31788 +atomic_t irq_mis_count;
31789 +#endif
31790 +#endif
31791 +
31792 +/*
31793 + * Generic, controller-independent functions:
31794 + */
31795 +
31796 +int show_interrupts(struct seq_file *p, void *v)
31797 +{
31798 +       int i = *(loff_t *) v, j;
31799 +       struct irqaction * action;
31800 +       unsigned long flags;
31801 +
31802 +       if (i == 0) {
31803 +               seq_printf(p, "           ");
31804 +               for (j=0; j<NR_CPUS; j++)
31805 +                       if (cpu_online(j))
31806 +                               seq_printf(p, "CPU%d       ",j);
31807 +               seq_putc(p, '\n');
31808 +       }
31809 +
31810 +       if (i < NR_IRQS) {
31811 +               spin_lock_irqsave(&irq_desc[i].lock, flags);
31812 +               action = irq_desc[i].action;
31813 +               if (!action) 
31814 +                       goto skip;
31815 +               seq_printf(p, "%3d: ",i);
31816 +#ifndef CONFIG_SMP
31817 +               seq_printf(p, "%10u ", kstat_irqs(i));
31818 +#else
31819 +               for (j=0; j<NR_CPUS; j++)
31820 +                       if (cpu_online(j))
31821 +                       seq_printf(p, "%10u ",
31822 +                               seq_printf(p, "%10u ",
31823 +                                       kstat_cpu(j).irqs[i]);
31824 +               seq_printf(p, " %14s", irq_desc[i].handler->typename);
31825 +
31826 +               seq_printf(p, "  %s", action->name);
31827 +               for (action=action->next; action; action = action->next)
31828 +                       seq_printf(p, ", %s", action->name);
31829 +               seq_putc(p, '\n');
31830 +skip:
31831 +               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
31832 +       } else if (i == NR_IRQS) {
31833 +               seq_printf(p, "NMI: ");
31834 +               for (j = 0; j < NR_CPUS; j++)
31835 +                       if (cpu_online(j))
31836 +                               seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
31837 +               seq_putc(p, '\n');
31838 +#ifdef CONFIG_X86_LOCAL_APIC
31839 +               seq_printf(p, "LOC: ");
31840 +               for (j = 0; j < NR_CPUS; j++)
31841 +                       if (cpu_online(j))
31842 +                               seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
31843 +               seq_putc(p, '\n');
31844 +#endif
31845 +               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
31846 +#ifdef CONFIG_X86_IO_APIC
31847 +#ifdef APIC_MISMATCH_DEBUG
31848 +               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
31849 +#endif
31850 +#endif
31851 +       }
31852 +       return 0;
31853 +}
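+
+/*
+ * Illustrative /proc/interrupts excerpt as formatted above, with
+ * hypothetical counts and handler names:
+ *
+ *            CPU0       CPU1
+ *   1:       9114        102   IO-APIC-edge  i8042
+ * NMI:          0          0
+ * ERR:          0
+ */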
31854 +
31855 +/*
31856 + * do_IRQ handles all normal device IRQ's (the special
31857 + * SMP cross-CPU interrupts have their own specific
31858 + * handlers).
31859 + */
31860 +asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
31861 +{      
31862 +       /* high bit used in ret_from_ code  */
31863 +       unsigned irq = ~regs->orig_rax;
31864 +
31865 +       exit_idle();
31866 +       irq_enter();
31867 +
31868 +       __do_IRQ(irq, regs);
31869 +       irq_exit();
31870 +
31871 +       return 1;
31872 +}
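+
+/*
+ * Worked example, assuming the usual complement encoding: the entry code
+ * pushes vector 0x31 as orig_rax == ~0x31, so the high bit is set (the
+ * ret_from_ marker noted above) and ~orig_rax recovers 0x31.
+ */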
31873 +
31874 +#ifdef CONFIG_HOTPLUG_CPU
31875 +void fixup_irqs(cpumask_t map)
31876 +{
31877 +       unsigned int irq;
31878 +       static int warned;
31879 +
31880 +       for (irq = 0; irq < NR_IRQS; irq++) {
31881 +               cpumask_t mask;
31882 +               if (irq == 2)
31883 +                       continue;
31884 +
31885 +               cpus_and(mask, irq_affinity[irq], map);
31886 +               if (any_online_cpu(mask) == NR_CPUS) {
31887 +                       printk("Breaking affinity for irq %i\n", irq);
31888 +                       mask = map;
31889 +               }
31890 +               if (irq_desc[irq].handler->set_affinity)
31891 +                       irq_desc[irq].handler->set_affinity(irq, mask);
31892 +               else if (irq_desc[irq].action && !(warned++))
31893 +                       printk("Cannot set affinity for irq %i\n", irq);
31894 +       }
31895 +
31896 +       /* That doesn't seem sufficient.  Give it 1ms. */
31897 +       local_irq_enable();
31898 +       mdelay(1);
31899 +       local_irq_disable();
31900 +}
31901 +#endif
31902 +
31903 +extern void call_softirq(void);
31904 +
31905 +asmlinkage void do_softirq(void)
31906 +{
31907 +       __u32 pending;
31908 +       unsigned long flags;
31909 +
31910 +       if (in_interrupt())
31911 +               return;
31912 +
31913 +       local_irq_save(flags);
31914 +       pending = local_softirq_pending();
31915 +       /* Switch to interrupt stack */
31916 +       if (pending)
31917 +               call_softirq();
31918 +       local_irq_restore(flags);
31919 +}
31920 +EXPORT_SYMBOL(do_softirq);
31921 +
31922 +#ifndef CONFIG_X86_LOCAL_APIC
31923 +/*
31924 + * 'what should we do if we get a hw irq event on an illegal vector'.
31925 + * each architecture has to answer this themselves.
31926 + */
31927 +void ack_bad_irq(unsigned int irq)
31928 +{
31929 +       printk("unexpected IRQ trap at vector %02x\n", irq);
31930 +}
31931 +#endif
31932 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/ldt-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/ldt-xen.c
31933 --- ref-linux-2.6.16.9/arch/x86_64/kernel/ldt-xen.c     1970-01-01 01:00:00.000000000 +0100
31934 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/ldt-xen.c        2006-04-10 00:05:52.000000000 +0200
31935 @@ -0,0 +1,282 @@
31936 +/*
31937 + * linux/arch/x86_64/kernel/ldt.c
31938 + *
31939 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
31940 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
31941 + * Copyright (C) 2002 Andi Kleen
31942 + * 
31943 + * This handles calls from both 32bit and 64bit mode.
31944 + */
31945 +
31946 +#include <linux/errno.h>
31947 +#include <linux/sched.h>
31948 +#include <linux/string.h>
31949 +#include <linux/mm.h>
31950 +#include <linux/smp.h>
31951 +#include <linux/smp_lock.h>
31952 +#include <linux/vmalloc.h>
31953 +#include <linux/slab.h>
31954 +
31955 +#include <asm/uaccess.h>
31956 +#include <asm/system.h>
31957 +#include <asm/ldt.h>
31958 +#include <asm/desc.h>
31959 +#include <asm/proto.h>
31960 +#include <asm/pgalloc.h>
31961 +
31962 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
31963 +static void flush_ldt(void *null)
31964 +{
31965 +       if (current->active_mm)
31966 +               load_LDT(&current->active_mm->context);
31967 +}
31968 +#endif
31969 +
31970 +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
31971 +{
31972 +       void *oldldt;
31973 +       void *newldt;
31974 +       unsigned oldsize;
31975 +
31976 +       if (mincount <= (unsigned)pc->size)
31977 +               return 0;
31978 +       oldsize = pc->size;
31979 +       mincount = (mincount+511)&(~511);
31980 +       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
31981 +               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
31982 +       else
31983 +               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
31984 +
31985 +       if (!newldt)
31986 +               return -ENOMEM;
31987 +
31988 +       if (oldsize)
31989 +               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
31990 +       oldldt = pc->ldt;
31991 +       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
31992 +       wmb();
31993 +       pc->ldt = newldt;
31994 +       wmb();
31995 +       pc->size = mincount;
31996 +       wmb();
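+       /* the barriers publish the new table before the new size, so a
+        * concurrent load_LDT() never sees the old pointer paired with
+        * the larger size */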
31997 +       if (reload) {
31998 +#ifdef CONFIG_SMP
31999 +               cpumask_t mask;
32000 +
32001 +               preempt_disable();
32002 +#endif
32003 +               make_pages_readonly(
32004 +                       pc->ldt,
32005 +                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
32006 +                       XENFEAT_writable_descriptor_tables);
32007 +               load_LDT(pc);
32008 +#ifdef CONFIG_SMP
32009 +               mask = cpumask_of_cpu(smp_processor_id());
32010 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
32011 +                       smp_call_function(flush_ldt, NULL, 1, 1);
32012 +               preempt_enable();
32013 +#endif
32014 +       }
32015 +       if (oldsize) {
32016 +               make_pages_writable(
32017 +                       oldldt,
32018 +                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
32019 +                       XENFEAT_writable_descriptor_tables);
32020 +               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
32021 +                       vfree(oldldt);
32022 +               else
32023 +                       kfree(oldldt);
32024 +       }
32025 +       return 0;
32026 +}
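+
+/*
+ * Sizing note: mincount is rounded up to a multiple of 512 entries, and
+ * 512 * LDT_ENTRY_SIZE (8 bytes) = 4096 = PAGE_SIZE, so the LDT always
+ * occupies whole pages, which the make_pages_readonly()/_writable()
+ * page arithmetic above depends on.
+ */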
32027 +
32028 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
32029 +{
32030 +       int err = alloc_ldt(new, old->size, 0);
32031 +       if (err < 0)
32032 +               return err;
32033 +       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
32034 +       make_pages_readonly(
32035 +               new->ldt,
32036 +               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
32037 +               XENFEAT_writable_descriptor_tables);
32038 +       return 0;
32039 +}
32040 +
32041 +/*
32042 + * we do not have to muck with descriptors here, that is
32043 + * done in switch_mm() as needed.
32044 + */
32045 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
32046 +{
32047 +       struct mm_struct * old_mm;
32048 +       int retval = 0;
32049 +
32050 +       memset(&mm->context, 0, sizeof(mm->context));
32051 +       init_MUTEX(&mm->context.sem);
32052 +       old_mm = current->mm;
32053 +       if (old_mm && old_mm->context.size > 0) {
32054 +               down(&old_mm->context.sem);
32055 +               retval = copy_ldt(&mm->context, &old_mm->context);
32056 +               up(&old_mm->context.sem);
32057 +       }
32058 +       if (retval == 0) {
32059 +               spin_lock(&mm_unpinned_lock);
32060 +               list_add(&mm->context.unpinned, &mm_unpinned);
32061 +               spin_unlock(&mm_unpinned_lock);
32062 +       }
32063 +       return retval;
32064 +}
32065 +
32066 +/*
32067 + * 
32068 + * Don't touch the LDT register - we're already in the next thread.
32069 + */
32070 +void destroy_context(struct mm_struct *mm)
32071 +{
32072 +       if (mm->context.size) {
32073 +               if (mm == current->active_mm)
32074 +                       clear_LDT();
32075 +               make_pages_writable(
32076 +                       mm->context.ldt,
32077 +                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
32078 +                       XENFEAT_writable_descriptor_tables);
32079 +               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
32080 +                       vfree(mm->context.ldt);
32081 +               else
32082 +                       kfree(mm->context.ldt);
32083 +               mm->context.size = 0;
32084 +       }
32085 +       if (!mm->context.pinned) {
32086 +               spin_lock(&mm_unpinned_lock);
32087 +               list_del(&mm->context.unpinned);
32088 +               spin_unlock(&mm_unpinned_lock);
32089 +       }
32090 +}
32091 +
32092 +static int read_ldt(void __user * ptr, unsigned long bytecount)
32093 +{
32094 +       int err;
32095 +       unsigned long size;
32096 +       struct mm_struct * mm = current->mm;
32097 +
32098 +       if (!mm->context.size)
32099 +               return 0;
32100 +       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
32101 +               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
32102 +
32103 +       down(&mm->context.sem);
32104 +       size = mm->context.size*LDT_ENTRY_SIZE;
32105 +       if (size > bytecount)
32106 +               size = bytecount;
32107 +
32108 +       err = 0;
32109 +       if (copy_to_user(ptr, mm->context.ldt, size))
32110 +               err = -EFAULT;
32111 +       up(&mm->context.sem);
32112 +       if (err < 0)
32113 +               goto error_return;
32114 +       if (size != bytecount) {
32115 +               /* zero-fill the rest */
32116 +               if (clear_user(ptr+size, bytecount-size) != 0) {
32117 +                       err = -EFAULT;
32118 +                       goto error_return;
32119 +               }
32120 +       }
32121 +       return bytecount;
32122 +error_return:
32123 +       return err;
32124 +}
32125 +
32126 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
32127 +{
32128 +       /* Arbitrary number */ 
32129 +       /* x86-64 default LDT is all zeros */
32130 +       if (bytecount > 128) 
32131 +               bytecount = 128;        
32132 +       if (clear_user(ptr, bytecount))
32133 +               return -EFAULT;
32134 +       return bytecount; 
32135 +}
32136 +
32137 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
32138 +{
32139 +       struct task_struct *me = current;
32140 +       struct mm_struct * mm = me->mm;
32141 +       __u32 entry_1, entry_2, *lp;
32142 +       unsigned long mach_lp;
32143 +       int error;
32144 +       struct user_desc ldt_info;
32145 +
32146 +       error = -EINVAL;
32147 +
32148 +       if (bytecount != sizeof(ldt_info))
32149 +               goto out;
32150 +       error = -EFAULT;        
32151 +       if (copy_from_user(&ldt_info, ptr, bytecount))
32152 +               goto out;
32153 +
32154 +       error = -EINVAL;
32155 +       if (ldt_info.entry_number >= LDT_ENTRIES)
32156 +               goto out;
32157 +       if (ldt_info.contents == 3) {
32158 +               if (oldmode)
32159 +                       goto out;
32160 +               if (ldt_info.seg_not_present == 0)
32161 +                       goto out;
32162 +       }
32163 +
32164 +       down(&mm->context.sem);
32165 +       if (ldt_info.entry_number >= (unsigned)mm->context.size) {
32166 +               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
32167 +               if (error < 0)
32168 +                       goto out_unlock;
32169 +       }
32170 +
32171 +       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
32172 +       mach_lp = arbitrary_virt_to_machine(lp);
32173 +
32174 +       /* Allow LDTs to be cleared by the user. */
32175 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
32176 +               if (oldmode || LDT_empty(&ldt_info)) {
32177 +                       entry_1 = 0;
32178 +                       entry_2 = 0;
32179 +                       goto install;
32180 +               }
32181 +       }
32182 +
32183 +       entry_1 = LDT_entry_a(&ldt_info);
32184 +       entry_2 = LDT_entry_b(&ldt_info);
32185 +       if (oldmode)
32186 +               entry_2 &= ~(1 << 20);
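+               /* bit 20 of the high dword is the AVL ("useable") bit,
+                * which the old interface did not expose */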
32187 +
32188 +       /* Install the new entry ...  */
32189 +install:
32190 +       error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
32191 +
32192 +out_unlock:
32193 +       up(&mm->context.sem);
32194 +out:
32195 +       return error;
32196 +}
32197 +
32198 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
32199 +{
32200 +       int ret = -ENOSYS;
32201 +
32202 +       switch (func) {
32203 +       case 0:
32204 +               ret = read_ldt(ptr, bytecount);
32205 +               break;
32206 +       case 1:
32207 +               ret = write_ldt(ptr, bytecount, 1);
32208 +               break;
32209 +       case 2:
32210 +               ret = read_default_ldt(ptr, bytecount);
32211 +               break;
32212 +       case 0x11:
32213 +               ret = write_ldt(ptr, bytecount, 0);
32214 +               break;
32215 +       }
32216 +       return ret;
32217 +}
32218 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/Makefile tmp-linux-2.6-xen.patch/arch/x86_64/kernel/Makefile
32219 --- ref-linux-2.6.16.9/arch/x86_64/kernel/Makefile      2006-04-19 08:10:14.000000000 +0200
32220 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/Makefile 2006-04-10 00:05:52.000000000 +0200
32221 @@ -20,11 +20,13 @@ obj-$(CONFIG_MICROCODE)             += microcode.o
32222  obj-$(CONFIG_X86_CPUID)                += cpuid.o
32223  obj-$(CONFIG_SMP)              += smp.o smpboot.o trampoline.o
32224  obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o  nmi.o
32225 +obj-$(CONFIG_X86_XEN_GENAPIC)  += genapic.o genapic_xen.o
32226  obj-$(CONFIG_X86_IO_APIC)      += io_apic.o mpparse.o \
32227                 genapic.o genapic_cluster.o genapic_flat.o
32228  obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o crash.o
32229  obj-$(CONFIG_CRASH_DUMP)       += crash_dump.o
32230 -obj-$(CONFIG_PM)               += suspend.o
32231 +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
32232 +obj-$(CONFIG_ACPI_SLEEP)       += suspend.o
32233  obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
32234  obj-$(CONFIG_CPU_FREQ)         += cpufreq/
32235  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
32236 @@ -51,3 +53,17 @@ i8237-y                              += ../../i386/kernel/i8237.o
32237  msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
32238  dmi_scan-y                     += ../../i386/kernel/dmi_scan.o
32239  
32240 +ifdef CONFIG_XEN
32241 +time-y                         += ../../i386/kernel/time-xen.o
32242 +pci-dma-y                      += ../../i386/kernel/pci-dma-xen.o
32243 +microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
32244 +quirks-y                       := ../../i386/kernel/quirks-xen.o
32245 +
32246 +n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
32247 +
32248 +include $(srctree)/scripts/Makefile.xen
32249 +
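+# Assumed semantics, defined in scripts/Makefile.xen: filterxen drops the
+# native-only objects listed in n-obj-xen, while cherrypickxen substitutes
+# foo-xen.o for foo.o wherever such a variant exists.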
32250 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
32251 +obj-y := $(call cherrypickxen, $(obj-y))
32252 +extra-y := $(call cherrypickxen, $(extra-y))
32253 +endif
32254 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/mpparse-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/mpparse-xen.c
32255 --- ref-linux-2.6.16.9/arch/x86_64/kernel/mpparse-xen.c 1970-01-01 01:00:00.000000000 +0100
32256 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/mpparse-xen.c    2006-04-10 00:05:52.000000000 +0200
32257 @@ -0,0 +1,1005 @@
32258 +/*
32259 + *     Intel Multiprocessor Specification 1.1 and 1.4
32260 + *     compliant MP-table parsing routines.
32261 + *
32262 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
32263 + *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
32264 + *
32265 + *     Fixes
32266 + *             Erich Boleyn    :       MP v1.4 and additional changes.
32267 + *             Alan Cox        :       Added EBDA scanning
32268 + *             Ingo Molnar     :       various cleanups and rewrites
32269 + *             Maciej W. Rozycki:      Bits for default MP configurations
32270 + *             Paul Diefenbaugh:       Added full ACPI support
32271 + */
32272 +
32273 +#include <linux/mm.h>
32274 +#include <linux/init.h>
32275 +#include <linux/delay.h>
32276 +#include <linux/config.h>
32277 +#include <linux/bootmem.h>
32278 +#include <linux/smp_lock.h>
32279 +#include <linux/kernel_stat.h>
32280 +#include <linux/mc146818rtc.h>
32281 +#include <linux/acpi.h>
32282 +#include <linux/module.h>
32283 +
32284 +#include <asm/smp.h>
32285 +#include <asm/mtrr.h>
32286 +#include <asm/mpspec.h>
32287 +#include <asm/pgalloc.h>
32288 +#include <asm/io_apic.h>
32289 +#include <asm/proto.h>
32290 +#include <asm/acpi.h>
32291 +
32292 +/* Have we found an MP table */
32293 +int smp_found_config;
32294 +unsigned int __initdata maxcpus = NR_CPUS;
32295 +
32296 +int acpi_found_madt;
32297 +
32298 +/*
32299 + * Various Linux-internal data structures created from the
32300 + * MP-table.
32301 + */
32302 +unsigned char apic_version [MAX_APICS];
32303 +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
32304 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
32305 +
32306 +static int mp_current_pci_id = 0;
32307 +/* I/O APIC entries */
32308 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
32309 +
32310 +/* # of MP IRQ source entries */
32311 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
32312 +
32313 +/* MP IRQ source entries */
32314 +int mp_irq_entries;
32315 +
32316 +int nr_ioapics;
32317 +int pic_mode;
32318 +unsigned long mp_lapic_addr = 0;
32319 +
32320 +
32321 +
32322 +/* Processor that is doing the boot up */
32323 +unsigned int boot_cpu_id = -1U;
32324 +/* Internal processor count */
32325 +unsigned int num_processors __initdata = 0;
32326 +
32327 +unsigned disabled_cpus __initdata;
32328 +
32329 +/* Bitmask of physically existing CPUs */
32330 +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
32331 +
32332 +/* ACPI MADT entry parsing functions */
32333 +#ifdef CONFIG_ACPI
32334 +extern struct acpi_boot_flags acpi_boot;
32335 +#ifdef CONFIG_X86_LOCAL_APIC
32336 +extern int acpi_parse_lapic (acpi_table_entry_header *header);
32337 +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
32338 +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
32339 +#endif /*CONFIG_X86_LOCAL_APIC*/
32340 +#ifdef CONFIG_X86_IO_APIC
32341 +extern int acpi_parse_ioapic (acpi_table_entry_header *header);
32342 +#endif /*CONFIG_X86_IO_APIC*/
32343 +#endif /*CONFIG_ACPI*/
32344 +
32345 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
32346 +
32347 +
32348 +/*
32349 + * Intel MP BIOS table parsing routines:
32350 + */
32351 +
32352 +/*
32353 + * Checksum an MP configuration block.
32354 + */
32355 +
32356 +static int __init mpf_checksum(unsigned char *mp, int len)
32357 +{
32358 +       int sum = 0;
32359 +
32360 +       while (len--)
32361 +               sum += *mp++;
32362 +
32363 +       return sum & 0xFF;
32364 +}
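+
+/* A well-formed block sums to 0 mod 256, so callers below treat any
+ * nonzero return as a checksum failure. */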
32365 +
32366 +#ifndef CONFIG_XEN
32367 +static void __init MP_processor_info (struct mpc_config_processor *m)
32368 +{
32369 +       int cpu;
32370 +       unsigned char ver;
32371 +       static int found_bsp=0;
32372 +
32373 +       if (!(m->mpc_cpuflag & CPU_ENABLED)) {
32374 +               disabled_cpus++;
32375 +               return;
32376 +       }
32377 +
32378 +       printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
32379 +               m->mpc_apicid,
32380 +              (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
32381 +              (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
32382 +               m->mpc_apicver);
32383 +
32384 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
32385 +               Dprintk("    Bootup CPU\n");
32386 +               boot_cpu_id = m->mpc_apicid;
32387 +       }
32388 +       if (num_processors >= NR_CPUS) {
32389 +               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
32390 +                       " Processor ignored.\n", NR_CPUS);
32391 +               return;
32392 +       }
32393 +
32394 +       cpu = num_processors++;
32395 +       
32396 +#if MAX_APICS < 255    
32397 +       if ((int)m->mpc_apicid > MAX_APICS) {
32398 +               printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
32399 +                       m->mpc_apicid, MAX_APICS);
32400 +               return;
32401 +       }
32402 +#endif
32403 +       ver = m->mpc_apicver;
32404 +
32405 +       physid_set(m->mpc_apicid, phys_cpu_present_map);
32406 +       /*
32407 +        * Validate version
32408 +        */
32409 +       if (ver == 0x0) {
32410 +               printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
32411 +               ver = 0x10;
32412 +       }
32413 +       apic_version[m->mpc_apicid] = ver;
32414 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
32415 +               /*
32416 +                * bios_cpu_apicid is required to have processors listed
32417 +                * in same order as logical cpu numbers. Hence the first
32418 +                * entry is BSP, and so on.
32419 +                */
32420 +               cpu = 0;
32421 +
32422 +               bios_cpu_apicid[0] = m->mpc_apicid;
32423 +               x86_cpu_to_apicid[0] = m->mpc_apicid;
32424 +               found_bsp = 1;
32425 +       } else
32426 +               cpu = num_processors - found_bsp;
32427 +       bios_cpu_apicid[cpu] = m->mpc_apicid;
32428 +       x86_cpu_to_apicid[cpu] = m->mpc_apicid;
32429 +
32430 +       cpu_set(cpu, cpu_possible_map);
32431 +       cpu_set(cpu, cpu_present_map);
32432 +}
32433 +#else
32434 +void __init MP_processor_info (struct mpc_config_processor *m)
32435 +{
32436 +       num_processors++;
32437 +}
32438 +#endif /* CONFIG_XEN */
32439 +
32440 +static void __init MP_bus_info (struct mpc_config_bus *m)
32441 +{
32442 +       char str[7];
32443 +
32444 +       memcpy(str, m->mpc_bustype, 6);
32445 +       str[6] = 0;
32446 +       Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
32447 +
32448 +       if (strncmp(str, "ISA", 3) == 0) {
32449 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
32450 +       } else if (strncmp(str, "EISA", 4) == 0) {
32451 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
32452 +       } else if (strncmp(str, "PCI", 3) == 0) {
32453 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
32454 +               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
32455 +               mp_current_pci_id++;
32456 +       } else if (strncmp(str, "MCA", 3) == 0) {
32457 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
32458 +       } else {
32459 +               printk(KERN_ERR "Unknown bustype %s\n", str);
32460 +       }
32461 +}
32462 +
32463 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
32464 +{
32465 +       if (!(m->mpc_flags & MPC_APIC_USABLE))
32466 +               return;
32467 +
32468 +       printk("I/O APIC #%d Version %d at 0x%X.\n",
32469 +               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
32470 +       if (nr_ioapics >= MAX_IO_APICS) {
32471 +               printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
32472 +                       MAX_IO_APICS, nr_ioapics);
32473 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
32474 +       }
32475 +       if (!m->mpc_apicaddr) {
32476 +               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
32477 +                       " found in MP table, skipping!\n");
32478 +               return;
32479 +       }
32480 +       mp_ioapics[nr_ioapics] = *m;
32481 +       nr_ioapics++;
32482 +}
32483 +
32484 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
32485 +{
32486 +       mp_irqs [mp_irq_entries] = *m;
32487 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
32488 +               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
32489 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
32490 +                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
32491 +                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
32492 +       if (++mp_irq_entries >= MAX_IRQ_SOURCES)
32493 +               panic("Max # of irq sources exceeded!!\n");
32494 +}
32495 +
32496 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
32497 +{
32498 +       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
32499 +               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
32500 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
32501 +                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
32502 +                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
32503 +       /*
32504 +        * Well it seems all SMP boards in existence
32505 +        * use ExtINT/LVT1 == LINT0 and
32506 +        * NMI/LVT2 == LINT1 - the following check
32507 +        * will show us if this assumption is false.
32508 +        * Until then we do not have to add baggage.
32509 +        */
32510 +       if ((m->mpc_irqtype == mp_ExtINT) &&
32511 +               (m->mpc_destapiclint != 0))
32512 +                       BUG();
32513 +       if ((m->mpc_irqtype == mp_NMI) &&
32514 +               (m->mpc_destapiclint != 1))
32515 +                       BUG();
32516 +}
32517 +
32518 +/*
32519 + * Read/parse the MPC
32520 + */
32521 +
32522 +static int __init smp_read_mpc(struct mp_config_table *mpc)
32523 +{
32524 +       char str[16];
32525 +       int count=sizeof(*mpc);
32526 +       unsigned char *mpt=((unsigned char *)mpc)+count;
32527 +
32528 +       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
32529 +               printk("SMP mptable: bad signature [%c%c%c%c]!\n",
32530 +                       mpc->mpc_signature[0],
32531 +                       mpc->mpc_signature[1],
32532 +                       mpc->mpc_signature[2],
32533 +                       mpc->mpc_signature[3]);
32534 +               return 0;
32535 +       }
32536 +       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
32537 +               printk("SMP mptable: checksum error!\n");
32538 +               return 0;
32539 +       }
32540 +       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
32541 +               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
32542 +                       mpc->mpc_spec);
32543 +               return 0;
32544 +       }
32545 +       if (!mpc->mpc_lapic) {
32546 +               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
32547 +               return 0;
32548 +       }
32549 +       memcpy(str,mpc->mpc_oem,8);
32550 +       str[8]=0;
32551 +       printk(KERN_INFO "OEM ID: %s ",str);
32552 +
32553 +       memcpy(str,mpc->mpc_productid,12);
32554 +       str[12]=0;
32555 +       printk("Product ID: %s ",str);
32556 +
32557 +       printk("APIC at: 0x%X\n",mpc->mpc_lapic);
32558 +
32559 +       /* save the local APIC address, it might be non-default */
32560 +       if (!acpi_lapic)
32561 +               mp_lapic_addr = mpc->mpc_lapic;
32562 +
32563 +       /*
32564 +        *      Now process the configuration blocks.
32565 +        */
32566 +       while (count < mpc->mpc_length) {
32567 +               switch(*mpt) {
32568 +                       case MP_PROCESSOR:
32569 +                       {
32570 +                               struct mpc_config_processor *m=
32571 +                                       (struct mpc_config_processor *)mpt;
32572 +                               if (!acpi_lapic)
32573 +                                       MP_processor_info(m);
32574 +                               mpt += sizeof(*m);
32575 +                               count += sizeof(*m);
32576 +                               break;
32577 +                       }
32578 +                       case MP_BUS:
32579 +                       {
32580 +                               struct mpc_config_bus *m=
32581 +                                       (struct mpc_config_bus *)mpt;
32582 +                               MP_bus_info(m);
32583 +                               mpt += sizeof(*m);
32584 +                               count += sizeof(*m);
32585 +                               break;
32586 +                       }
32587 +                       case MP_IOAPIC:
32588 +                       {
32589 +                               struct mpc_config_ioapic *m=
32590 +                                       (struct mpc_config_ioapic *)mpt;
32591 +                               MP_ioapic_info(m);
32592 +                               mpt+=sizeof(*m);
32593 +                               count+=sizeof(*m);
32594 +                               break;
32595 +                       }
32596 +                       case MP_INTSRC:
32597 +                       {
32598 +                               struct mpc_config_intsrc *m=
32599 +                                       (struct mpc_config_intsrc *)mpt;
32600 +
32601 +                               MP_intsrc_info(m);
32602 +                               mpt+=sizeof(*m);
32603 +                               count+=sizeof(*m);
32604 +                               break;
32605 +                       }
32606 +                       case MP_LINTSRC:
32607 +                       {
32608 +                               struct mpc_config_lintsrc *m=
32609 +                                       (struct mpc_config_lintsrc *)mpt;
32610 +                               MP_lintsrc_info(m);
32611 +                               mpt+=sizeof(*m);
32612 +                               count+=sizeof(*m);
32613 +                               break;
32614 +                       }
32615 +               }
32616 +       }
32617 +       clustered_apic_check();
32618 +       if (!num_processors)
32619 +               printk(KERN_ERR "SMP mptable: no processors registered!\n");
32620 +       return num_processors;
32621 +}
32622 +
32623 +static int __init ELCR_trigger(unsigned int irq)
32624 +{
32625 +       unsigned int port;
32626 +
32627 +       port = 0x4d0 + (irq >> 3);
32628 +       return (inb(port) >> (irq & 7)) & 1;
32629 +}
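+
+/* e.g. IRQ 9 reads port 0x4d0 + (9 >> 3) = 0x4d1, bit 9 & 7 = 1; a set
+ * bit marks the line as level-triggered. */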
32630 +
32631 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
32632 +{
32633 +       struct mpc_config_intsrc intsrc;
32634 +       int i;
32635 +       int ELCR_fallback = 0;
32636 +
32637 +       intsrc.mpc_type = MP_INTSRC;
32638 +       intsrc.mpc_irqflag = 0;                 /* conforming */
32639 +       intsrc.mpc_srcbus = 0;
32640 +       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
32641 +
32642 +       intsrc.mpc_irqtype = mp_INT;
32643 +
32644 +       /*
32645 +        *  If true, we have an ISA/PCI system with no IRQ entries
32646 +        *  in the MP table. To prevent the PCI interrupts from being set up
32647 +        *  incorrectly, we try to use the ELCR. The sanity check to see if
32648 +        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
32649 +        *  never be level sensitive, so we simply see if the ELCR agrees.
32650 +        *  If it does, we assume it's valid.
32651 +        */
32652 +       if (mpc_default_type == 5) {
32653 +               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
32654 +
32655 +               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
32656 +                       printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
32657 +               else {
32658 +                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
32659 +                       ELCR_fallback = 1;
32660 +               }
32661 +       }
32662 +
32663 +       for (i = 0; i < 16; i++) {
32664 +               switch (mpc_default_type) {
32665 +               case 2:
32666 +                       if (i == 0 || i == 13)
32667 +                               continue;       /* IRQ0 & IRQ13 not connected */
32668 +                       /* fall through */
32669 +               default:
32670 +                       if (i == 2)
32671 +                               continue;       /* IRQ2 is never connected */
32672 +               }
32673 +
32674 +               if (ELCR_fallback) {
32675 +                       /*
32676 +                        *  If the ELCR indicates a level-sensitive interrupt, we
32677 +                        *  copy that information over to the MP table in the
32678 +                        *  irqflag field (level sensitive, active high polarity).
32679 +                        */
32680 +                       if (ELCR_trigger(i))
32681 +                               intsrc.mpc_irqflag = 13;
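+                               /* 13 == (3 << 2) | 1: level, active high */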
32682 +                       else
32683 +                               intsrc.mpc_irqflag = 0;
32684 +               }
32685 +
32686 +               intsrc.mpc_srcbusirq = i;
32687 +               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
32688 +               MP_intsrc_info(&intsrc);
32689 +       }
32690 +
32691 +       intsrc.mpc_irqtype = mp_ExtINT;
32692 +       intsrc.mpc_srcbusirq = 0;
32693 +       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
32694 +       MP_intsrc_info(&intsrc);
32695 +}
32696 +
32697 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
32698 +{
32699 +       struct mpc_config_processor processor;
32700 +       struct mpc_config_bus bus;
32701 +       struct mpc_config_ioapic ioapic;
32702 +       struct mpc_config_lintsrc lintsrc;
32703 +       int linttypes[2] = { mp_ExtINT, mp_NMI };
32704 +       int i;
32705 +
32706 +       /*
32707 +        * local APIC has default address
32708 +        */
32709 +       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
32710 +
32711 +       /*
32712 +        * 2 CPUs, numbered 0 & 1.
32713 +        */
32714 +       processor.mpc_type = MP_PROCESSOR;
32715 +       /* Either an integrated APIC or a discrete 82489DX. */
32716 +       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
32717 +       processor.mpc_cpuflag = CPU_ENABLED;
32718 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
32719 +                                  (boot_cpu_data.x86_model << 4) |
32720 +                                  boot_cpu_data.x86_mask;
32721 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
32722 +       processor.mpc_reserved[0] = 0;
32723 +       processor.mpc_reserved[1] = 0;
32724 +       for (i = 0; i < 2; i++) {
32725 +               processor.mpc_apicid = i;
32726 +               MP_processor_info(&processor);
32727 +       }
32728 +
32729 +       bus.mpc_type = MP_BUS;
32730 +       bus.mpc_busid = 0;
32731 +       switch (mpc_default_type) {
32732 +               default:
32733 +                       printk(KERN_ERR "???\nUnknown standard configuration %d\n",
32734 +                               mpc_default_type);
32735 +                       /* fall through */
32736 +               case 1:
32737 +               case 5:
32738 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
32739 +                       break;
32740 +               case 2:
32741 +               case 6:
32742 +               case 3:
32743 +                       memcpy(bus.mpc_bustype, "EISA  ", 6);
32744 +                       break;
32745 +               case 4:
32746 +               case 7:
32747 +                       memcpy(bus.mpc_bustype, "MCA   ", 6);
32748 +       }
32749 +       MP_bus_info(&bus);
32750 +       if (mpc_default_type > 4) {
32751 +               bus.mpc_busid = 1;
32752 +               memcpy(bus.mpc_bustype, "PCI   ", 6);
32753 +               MP_bus_info(&bus);
32754 +       }
32755 +
32756 +       ioapic.mpc_type = MP_IOAPIC;
32757 +       ioapic.mpc_apicid = 2;
32758 +       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
32759 +       ioapic.mpc_flags = MPC_APIC_USABLE;
32760 +       ioapic.mpc_apicaddr = 0xFEC00000;
32761 +       MP_ioapic_info(&ioapic);
32762 +
32763 +       /*
32764 +        * We set up most of the low 16 IO-APIC pins according to MPS rules.
32765 +        */
32766 +       construct_default_ioirq_mptable(mpc_default_type);
32767 +
32768 +       lintsrc.mpc_type = MP_LINTSRC;
32769 +       lintsrc.mpc_irqflag = 0;                /* conforming */
32770 +       lintsrc.mpc_srcbusid = 0;
32771 +       lintsrc.mpc_srcbusirq = 0;
32772 +       lintsrc.mpc_destapic = MP_APIC_ALL;
32773 +       for (i = 0; i < 2; i++) {
32774 +               lintsrc.mpc_irqtype = linttypes[i];
32775 +               lintsrc.mpc_destapiclint = i;
32776 +               MP_lintsrc_info(&lintsrc);
32777 +       }
32778 +}
32779 +
32780 +static struct intel_mp_floating *mpf_found;
32781 +
32782 +/*
32783 + * Scan the memory blocks for an SMP configuration block.
32784 + */
32785 +void __init get_smp_config (void)
32786 +{
32787 +       struct intel_mp_floating *mpf = mpf_found;
32788 +
32789 +       /*
32790 +        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
32791 +        * processors, where MPS only supports physical.
32792 +        */
32793 +       if (acpi_lapic && acpi_ioapic) {
32794 +               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
32795 +               return;
32796 +       }
32797 +       else if (acpi_lapic)
32798 +               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
32799 +
32800 +       printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
32801 +       if (mpf->mpf_feature2 & (1<<7)) {
32802 +               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
32803 +               pic_mode = 1;
32804 +       } else {
32805 +               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
32806 +               pic_mode = 0;
32807 +       }
32808 +
32809 +       /*
32810 +        * Now see if we need to read further.
32811 +        */
32812 +       if (mpf->mpf_feature1 != 0) {
32813 +
32814 +               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
32815 +               construct_default_ISA_mptable(mpf->mpf_feature1);
32816 +
32817 +       } else if (mpf->mpf_physptr) {
32818 +
32819 +               /*
32820 +                * Read the physical hardware table.  Anything here will
32821 +                * override the defaults.
32822 +                */
32823 +               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
32824 +                       smp_found_config = 0;
32825 +                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
32826 +                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
32827 +                       return;
32828 +               }
32829 +               /*
32830 +                * If there are no explicit MP IRQ entries, then we are
32831 +                * broken.  We set up most of the low 16 IO-APIC pins to
32832 +                * ISA defaults and hope it will work.
32833 +                */
32834 +               if (!mp_irq_entries) {
32835 +                       struct mpc_config_bus bus;
32836 +
32837 +                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
32838 +
32839 +                       bus.mpc_type = MP_BUS;
32840 +                       bus.mpc_busid = 0;
32841 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
32842 +                       MP_bus_info(&bus);
32843 +
32844 +                       construct_default_ioirq_mptable(0);
32845 +               }
32846 +
32847 +       } else
32848 +               BUG();
32849 +
32850 +       printk(KERN_INFO "Processors: %d\n", num_processors);
32851 +       /*
32852 +        * Only use the first configuration found.
32853 +        */
32854 +}
32855 +
32856 +static int __init smp_scan_config (unsigned long base, unsigned long length)
32857 +{
32858 +       extern void __bad_mpf_size(void); 
32859 +       unsigned int *bp = isa_bus_to_virt(base);
32860 +       struct intel_mp_floating *mpf;
32861 +
32862 +       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
32863 +       if (sizeof(*mpf) != 16)
32864 +               __bad_mpf_size();
32865 +
32866 +       while (length > 0) {
32867 +               mpf = (struct intel_mp_floating *)bp;
32868 +               if ((*bp == SMP_MAGIC_IDENT) &&
32869 +                       (mpf->mpf_length == 1) &&
32870 +                       !mpf_checksum((unsigned char *)bp, 16) &&
32871 +                       ((mpf->mpf_specification == 1)
32872 +                               || (mpf->mpf_specification == 4)) ) {
32873 +
32874 +                       smp_found_config = 1;
32875 +                       mpf_found = mpf;
32876 +                       return 1;
32877 +               }
32878 +               bp += 4;
32879 +               length -= 16;
32880 +       }
32881 +       return 0;
32882 +}
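+
+/* bp advances four ints (16 bytes) per step: the floating structure is
+ * one 16-byte paragraph (mpf_length == 1 above) and is specified to sit
+ * on a 16-byte boundary. */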
32883 +
32884 +void __init find_intel_smp (void)
32885 +{
32886 +       unsigned int address;
32887 +
32888 +       /*
32889 +        * FIXME: Linux assumes you have 640K of base ram..
32890 +        * this continues the error...
32891 +        *
32892 +        * 1) Scan the bottom 1K for a signature
32893 +        * 2) Scan the top 1K of base RAM
32894 +        * 3) Scan the 64K of bios
32895 +        */
32896 +       if (smp_scan_config(0x0,0x400) ||
32897 +               smp_scan_config(639*0x400,0x400) ||
32898 +                       smp_scan_config(0xF0000,0x10000))
32899 +               return;
32900 +       /*
32901 +        * If it is an SMP machine we should know now, unless the
32902 +        * configuration is in an EISA/MCA bus machine with an
32903 +        * extended bios data area.
32904 +        *
32905 +        * there is a real-mode segmented pointer pointing to the
32906 +        * 4K EBDA area at 0x40E, calculate and scan it here.
32907 +        *
32908 +        * NOTE! There are Linux loaders that will corrupt the EBDA
32909 +        * area, and as such this kind of SMP config may be less
32910 +        * trustworthy, simply because the SMP table may have been
32911 +        * stomped on during early boot. These loaders are buggy and
32912 +        * should be fixed.
32913 +        */
32914 +
32915 +       address = *(unsigned short *)phys_to_virt(0x40E);
32916 +       address <<= 4;
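+       /* real-mode segment to physical address: a hypothetical 0x9FC0
+        * at 0x40E yields 0x9FC00 */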
32917 +       if (smp_scan_config(address, 0x1000))
32918 +               return;
32919 +
32920 +       /* If we have come this far, we did not find an MP table  */
32921 +       printk(KERN_INFO "No mptable found.\n");
32922 +}
32923 +
32924 +/*
32925 + * - Intel MP Configuration Table
32926 + */
32927 +void __init find_smp_config (void)
32928 +{
32929 +#ifdef CONFIG_X86_LOCAL_APIC
32930 +       find_intel_smp();
32931 +#endif
32932 +}
32933 +
32934 +
32935 +/* --------------------------------------------------------------------------
32936 +                            ACPI-based MP Configuration
32937 +   -------------------------------------------------------------------------- */
32938 +
32939 +#ifdef CONFIG_ACPI
32940 +
32941 +void __init mp_register_lapic_address (
32942 +       u64                     address)
32943 +{
32944 +#ifndef CONFIG_XEN
32945 +       mp_lapic_addr = (unsigned long) address;
32946 +
32947 +       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
32948 +
32949 +       if (boot_cpu_id == -1U)
32950 +               boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
32951 +
32952 +       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
32953 +#endif
32954 +}
32955 +
32956 +
32957 +void __init mp_register_lapic (
32958 +       u8                      id, 
32959 +       u8                      enabled)
32960 +{
32961 +       struct mpc_config_processor processor;
32962 +       int                     boot_cpu = 0;
32963 +       
32964 +       if (id >= MAX_APICS) {
32965 +               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
32966 +                       id, MAX_APICS);
32967 +               return;
32968 +       }
32969 +
32970 +       if (id == boot_cpu_physical_apicid)
32971 +               boot_cpu = 1;
32972 +
32973 +#ifndef CONFIG_XEN
32974 +       processor.mpc_type = MP_PROCESSOR;
32975 +       processor.mpc_apicid = id;
32976 +       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
32977 +       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
32978 +       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
32979 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
32980 +               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
32981 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
32982 +       processor.mpc_reserved[0] = 0;
32983 +       processor.mpc_reserved[1] = 0;
32984 +#endif
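+       /* Under CONFIG_XEN the structure above stays unfilled; the Xen
+        * variant of MP_processor_info() earlier in this file only bumps
+        * num_processors. */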
32985 +
32986 +       MP_processor_info(&processor);
32987 +}
32988 +
32989 +#ifdef CONFIG_X86_IO_APIC
32990 +
32991 +#define MP_ISA_BUS             0
32992 +#define MP_MAX_IOAPIC_PIN      127
32993 +
32994 +static struct mp_ioapic_routing {
32995 +       int                     apic_id;
32996 +       int                     gsi_start;
32997 +       int                     gsi_end;
32998 +       u32                     pin_programmed[4];
32999 +} mp_ioapic_routing[MAX_IO_APICS];
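+/* pin_programmed[] is a 4 x 32 = 128-bit map, one bit per pin up to
+ * MP_MAX_IOAPIC_PIN (127). */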
33000 +
33001 +
33002 +static int mp_find_ioapic (
33003 +       int                     gsi)
33004 +{
33005 +       int                     i = 0;
33006 +
33007 +       /* Find the IOAPIC that manages this GSI. */
33008 +       for (i = 0; i < nr_ioapics; i++) {
33009 +               if ((gsi >= mp_ioapic_routing[i].gsi_start)
33010 +                       && (gsi <= mp_ioapic_routing[i].gsi_end))
33011 +                       return i;
33012 +       }
33013 +
33014 +       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
33015 +
33016 +       return -1;
33017 +}
33018 +       
33019 +
33020 +void __init mp_register_ioapic (
33021 +       u8                      id, 
33022 +       u32                     address,
33023 +       u32                     gsi_base)
33024 +{
33025 +       int                     idx = 0;
33026 +
33027 +       if (nr_ioapics >= MAX_IO_APICS) {
33028 +               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
33029 +                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
33030 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
33031 +       }
33032 +       if (!address) {
33033 +               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
33034 +                       " found in MADT table, skipping!\n");
33035 +               return;
33036 +       }
33037 +
33038 +       idx = nr_ioapics++;
33039 +
33040 +       mp_ioapics[idx].mpc_type = MP_IOAPIC;
33041 +       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
33042 +       mp_ioapics[idx].mpc_apicaddr = address;
33043 +
33044 +#ifndef CONFIG_XEN
33045 +       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
33046 +#endif
33047 +       mp_ioapics[idx].mpc_apicid = id;
33048 +       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
33049 +       
33050 +       /* 
33051 +        * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
33052 +        * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
33053 +        */
33054 +       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
33055 +       mp_ioapic_routing[idx].gsi_start = gsi_base;
33056 +       mp_ioapic_routing[idx].gsi_end = gsi_base + 
33057 +               io_apic_get_redir_entries(idx);
33058 +
33059 +       printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
33060 +               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
33061 +               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
33062 +               mp_ioapic_routing[idx].gsi_start,
33063 +               mp_ioapic_routing[idx].gsi_end);
33064 +
33065 +       return;
33066 +}
33067 +
33068 +
33069 +void __init mp_override_legacy_irq (
33070 +       u8                      bus_irq,
33071 +       u8                      polarity, 
33072 +       u8                      trigger, 
33073 +       u32                     gsi)
33074 +{
33075 +       struct mpc_config_intsrc intsrc;
33076 +       int                     ioapic = -1;
33077 +       int                     pin = -1;
33078 +
33079 +       /* 
33080 +        * Convert 'gsi' to 'ioapic.pin'.
33081 +        */
33082 +       ioapic = mp_find_ioapic(gsi);
33083 +       if (ioapic < 0)
33084 +               return;
33085 +       pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
33086 +
33087 +       /*
33088 +        * TBD: This check is for faulty timer entries, where the override
33089 +        *      erroneously sets the trigger to level, resulting in a HUGE 
33090 +        *      increase of timer interrupts!
33091 +        */
33092 +       if ((bus_irq == 0) && (trigger == 3))
33093 +               trigger = 1;
33094 +
33095 +       intsrc.mpc_type = MP_INTSRC;
33096 +       intsrc.mpc_irqtype = mp_INT;
33097 +       intsrc.mpc_irqflag = (trigger << 2) | polarity;
33098 +       intsrc.mpc_srcbus = MP_ISA_BUS;
33099 +       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
33100 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
33101 +       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
33102 +
33103 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", 
33104 +               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
33105 +               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
33106 +               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
33107 +
33108 +       mp_irqs[mp_irq_entries] = intsrc;
33109 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
33110 +               panic("Max # of irq sources exceeded!\n");
33111 +
33112 +       return;
33113 +}
33114 +
33115 +
33116 +void __init mp_config_acpi_legacy_irqs (void)
33117 +{
33118 +       struct mpc_config_intsrc intsrc;
33119 +       int                     i = 0;
33120 +       int                     ioapic = -1;
33121 +
33122 +       /* 
33123 +        * Fabricate the legacy ISA bus (bus #0, MP_ISA_BUS).
33124 +        */
33125 +       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
33126 +       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
33127 +
33128 +       /* 
33129 +        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
33130 +        */
33131 +       ioapic = mp_find_ioapic(0);
33132 +       if (ioapic < 0)
33133 +               return;
33134 +
33135 +       intsrc.mpc_type = MP_INTSRC;
33136 +       intsrc.mpc_irqflag = 0;                                 /* Conforming */
33137 +       intsrc.mpc_srcbus = MP_ISA_BUS;
33138 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
33139 +
33140 +       /* 
33141 +        * Use the default configuration for IRQs 0-15, unless
33142 +        * overridden by (MADT) interrupt source override entries.
33143 +        */
33144 +       for (i = 0; i < 16; i++) {
33145 +               int idx;
33146 +
33147 +               for (idx = 0; idx < mp_irq_entries; idx++) {
33148 +                       struct mpc_config_intsrc *irq = mp_irqs + idx;
33149 +
33150 +                       /* Do we already have a mapping for this ISA IRQ? */
33151 +                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
33152 +                               break;
33153 +
33154 +                       /* Do we already have a mapping for this IOAPIC pin */
33155 +                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
33156 +                               (irq->mpc_dstirq == i))
33157 +                               break;
33158 +               }
33159 +
33160 +               if (idx != mp_irq_entries) {
33161 +                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
33162 +                       continue;                       /* IRQ already used */
33163 +               }
33164 +
33165 +               intsrc.mpc_irqtype = mp_INT;
33166 +               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
33167 +               intsrc.mpc_dstirq = i;
33168 +
33169 +               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
33170 +                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
33171 +                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
33172 +                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
33173 +                       intsrc.mpc_dstirq);
33174 +
33175 +               mp_irqs[mp_irq_entries] = intsrc;
33176 +               if (++mp_irq_entries == MAX_IRQ_SOURCES)
33177 +                       panic("Max # of irq sources exceeded!\n");
33178 +       }
33179 +
33180 +       return;
33181 +}
33182 +
33183 +#define MAX_GSI_NUM    4096
33184 +
33185 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
33186 +{
33187 +       int                     ioapic = -1;
33188 +       int                     ioapic_pin = 0;
33189 +       int                     idx, bit = 0;
33190 +       static int              pci_irq = 16;
33191 +       /*
33192 +        * Mapping between Global System Interrupts, which
33193 +        * represent all possible interrupts, to the IRQs
33194 +        * assigned to actual devices.
33195 +        */
33196 +       static int              gsi_to_irq[MAX_GSI_NUM];
33197 +
33198 +       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
33199 +               return gsi;
33200 +
33201 +       /* Don't set up the ACPI SCI because it's already set up */
33202 +       if (acpi_fadt.sci_int == gsi)
33203 +               return gsi;
33204 +
33205 +       ioapic = mp_find_ioapic(gsi);
33206 +       if (ioapic < 0) {
33207 +               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
33208 +               return gsi;
33209 +       }
33210 +
33211 +       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
33212 +
33213 +       /* 
33214 +        * Avoid pin reprogramming.  PRTs typically include entries  
33215 +        * with redundant pin->gsi mappings (but unique PCI devices);
33216 +        * we only program the IOAPIC on the first.
33217 +        */
33218 +       bit = ioapic_pin % 32;
33219 +       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
33220 +       if (idx > 3) {
33221 +               printk(KERN_ERR "Invalid reference to IOAPIC pin "
33222 +                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
33223 +                       ioapic_pin);
33224 +               return gsi;
33225 +       }
33226 +       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
33227 +               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
33228 +                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
33229 +               return gsi_to_irq[gsi];
33230 +       }
33231 +
33232 +       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
33233 +
33234 +       if (triggering == ACPI_LEVEL_SENSITIVE) {
33235 +               /*
33236 +                * For PCI devices assign IRQs in order, avoiding gaps
33237 +                * due to unused I/O APIC pins.
33238 +                */
33239 +               int irq = gsi;
33240 +               if (gsi < MAX_GSI_NUM) {
33241 +                       if (gsi > 15)
33242 +                               gsi = pci_irq++;
33243 +                       /*
33244 +                        * Don't assign IRQ used by ACPI SCI
33245 +                        */
33246 +                       if (gsi == acpi_fadt.sci_int)
33247 +                               gsi = pci_irq++;
33248 +                       gsi_to_irq[irq] = gsi;
33249 +               } else {
33250 +                       printk(KERN_ERR "GSI %u is too high\n", gsi);
33251 +                       return gsi;
33252 +               }
33253 +       }
33254 +
33255 +       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
33256 +               triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
33257 +               polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
33258 +       return gsi;
33259 +}
33260 +
33261 +#endif /*CONFIG_X86_IO_APIC*/
33262 +#endif /*CONFIG_ACPI*/
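
Aside (not part of the patch): mp_register_gsi() above avoids reprogramming an
IOAPIC pin that a previous PRT entry already configured by keeping one 128-bit
bitmap per IOAPIC -- four 32-bit words, indexed by pin/32 and pin%32 (the
patch's "(ioapic_pin < 32) ? 0 : (ioapic_pin / 32)" computes the same word
index). A minimal standalone C sketch of that bookkeeping follows; struct
pin_map and claim_pin() are invented names for illustration only.

#include <stdio.h>

/* Mirrors the pin_programmed[4] field of struct mp_ioapic_routing above. */
struct pin_map {
	unsigned int pin_programmed[4];		/* 4 x 32 = 128 pins max */
};

/* Returns 1 if the pin was already claimed, 0 on first use, -1 if out of range. */
static int claim_pin(struct pin_map *m, int pin)
{
	int idx = pin / 32;			/* which 32-bit word */
	int bit = pin % 32;			/* which bit within it */

	if (idx > 3)
		return -1;			/* same bound mp_register_gsi() checks */
	if (m->pin_programmed[idx] & (1u << bit))
		return 1;			/* redundant PRT entry: do not reprogram */
	m->pin_programmed[idx] |= 1u << bit;
	return 0;
}

int main(void)
{
	struct pin_map m = { { 0 } };

	printf("%d %d\n", claim_pin(&m, 19), claim_pin(&m, 19));	/* prints "0 1" */
	return 0;
}
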
33263 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/pci-swiotlb-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/pci-swiotlb-xen.c
33264 --- ref-linux-2.6.16.9/arch/x86_64/kernel/pci-swiotlb-xen.c     1970-01-01 01:00:00.000000000 +0100
33265 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/pci-swiotlb-xen.c        2006-04-10 00:05:52.000000000 +0200
33266 @@ -0,0 +1,54 @@
33267 +/* Glue code to lib/swiotlb.c */
33268 +
33269 +#include <linux/pci.h>
33270 +#include <linux/cache.h>
33271 +#include <linux/module.h>
33272 +#include <asm/dma-mapping.h>
33273 +#include <asm/proto.h>
33274 +#include <asm/swiotlb.h>
33275 +#include <asm/dma.h>
33276 +
33277 +#if 0
33278 +int swiotlb __read_mostly;
33279 +EXPORT_SYMBOL(swiotlb);
33280 +#endif
33281 +
33282 +struct dma_mapping_ops swiotlb_dma_ops = {
33283 +#if 0
33284 +       .mapping_error = swiotlb_dma_mapping_error,
33285 +       .alloc_coherent = swiotlb_alloc_coherent,
33286 +       .free_coherent = swiotlb_free_coherent,
33287 +       .map_single = swiotlb_map_single,
33288 +       .unmap_single = swiotlb_unmap_single,
33289 +       .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
33290 +       .sync_single_for_device = swiotlb_sync_single_for_device,
33291 +       .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
33292 +       .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
33293 +       .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
33294 +       .sync_sg_for_device = swiotlb_sync_sg_for_device,
33295 +       .map_sg = swiotlb_map_sg,
33296 +       .unmap_sg = swiotlb_unmap_sg,
33297 +       .dma_supported = NULL,
33298 +#endif
33299 +};
33300 +
33301 +void pci_swiotlb_init(void)
33302 +{
33303 +#if 0
33304 +       /* don't initialize swiotlb if iommu=off (no_iommu=1) */
33305 +       if (!iommu_aperture && !no_iommu &&
33306 +           (end_pfn > MAX_DMA32_PFN || force_iommu))
33307 +              swiotlb = 1;
33308 +       if (swiotlb) {
33309 +               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
33310 +               swiotlb_init();
33311 +               dma_ops = &swiotlb_dma_ops;
33312 +       }
33313 +#else
33314 +       swiotlb_init();
33315 +       if (swiotlb) {
33316 +               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
33317 +               dma_ops = &swiotlb_dma_ops;
33318 +       }
33319 +#endif
33320 +}
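
Aside (not part of the patch): under Xen the native heuristics above are
compiled out with #if 0 and pci_swiotlb_init() unconditionally runs
swiotlb_init(), after which DMA mapping calls dispatch through the global
dma_ops pointer to the software bounce buffer. A minimal sketch of that
ops-table dispatch pattern follows; mapping_ops, bounce_map() and bounce_ops
are invented names, not the kernel API.

#include <stdio.h>

struct mapping_ops {
	unsigned long (*map_single)(void *ptr, unsigned long size);
};

/* Toy stand-in for swiotlb_map_single(): copy through a bounce buffer. */
static unsigned long bounce_map(void *ptr, unsigned long size)
{
	printf("bouncing %lu bytes\n", size);
	return (unsigned long)ptr;	/* a real implementation returns a bus address */
}

static struct mapping_ops bounce_ops = { .map_single = bounce_map };
static struct mapping_ops *ops;		/* plays the role of the global dma_ops */

int main(void)
{
	char buf[64];

	ops = &bounce_ops;		/* what pci_swiotlb_init() effectively does */
	ops->map_single(buf, sizeof(buf));
	return 0;
}
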
33321 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/process-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/process-xen.c
33322 --- ref-linux-2.6.16.9/arch/x86_64/kernel/process-xen.c 1970-01-01 01:00:00.000000000 +0100
33323 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/process-xen.c    2006-04-10 00:05:52.000000000 +0200
33324 @@ -0,0 +1,810 @@
33325 +/*
33326 + *  linux/arch/x86-64/kernel/process.c
33327 + *
33328 + *  Copyright (C) 1995  Linus Torvalds
33329 + *
33330 + *  Pentium III FXSR, SSE support
33331 + *     Gareth Hughes <gareth@valinux.com>, May 2000
33332 + * 
33333 + *  X86-64 port
33334 + *     Andi Kleen.
33335 + *
33336 + *     CPU hotplug support - ashok.raj@intel.com
33337 + *  $Id$
33338 + * 
33339 + *  Jun Nakajima <jun.nakajima@intel.com> 
33340 + *     Modified for Xen
33341 + */
33342 +
33343 +/*
33344 + * This file handles the architecture-dependent parts of process handling.
33345 + */
33346 +
33347 +#include <stdarg.h>
33348 +
33349 +#include <linux/cpu.h>
33350 +#include <linux/errno.h>
33351 +#include <linux/sched.h>
33352 +#include <linux/kernel.h>
33353 +#include <linux/mm.h>
33354 +#include <linux/elfcore.h>
33355 +#include <linux/smp.h>
33356 +#include <linux/slab.h>
33357 +#include <linux/user.h>
33358 +#include <linux/module.h>
33359 +#include <linux/a.out.h>
33360 +#include <linux/interrupt.h>
33361 +#include <linux/delay.h>
33362 +#include <linux/ptrace.h>
33363 +#include <linux/utsname.h>
33364 +#include <linux/random.h>
33365 +#include <linux/kprobes.h>
33366 +#include <linux/notifier.h>
33367 +
33368 +#include <asm/uaccess.h>
33369 +#include <asm/pgtable.h>
33370 +#include <asm/system.h>
33371 +#include <asm/io.h>
33372 +#include <asm/processor.h>
33373 +#include <asm/i387.h>
33374 +#include <asm/mmu_context.h>
33375 +#include <asm/pda.h>
33376 +#include <asm/prctl.h>
33377 +#include <asm/kdebug.h>
33378 +#include <xen/interface/dom0_ops.h>
33379 +#include <xen/interface/physdev.h>
33380 +#include <xen/interface/vcpu.h>
33381 +#include <asm/desc.h>
33382 +#include <asm/proto.h>
33383 +#include <asm/hardirq.h>
33384 +#include <asm/ia32.h>
33385 +#include <asm/idle.h>
33386 +
33387 +asmlinkage extern void ret_from_fork(void);
33388 +
33389 +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
33390 +
33391 +unsigned long boot_option_idle_override = 0;
33392 +EXPORT_SYMBOL(boot_option_idle_override);
33393 +
33394 +/*
33395 + * Power management idle function, if any.
33396 + */
33397 +void (*pm_idle)(void);
33398 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
33399 +
33400 +static struct notifier_block *idle_notifier;
33401 +static DEFINE_SPINLOCK(idle_notifier_lock);
33402 +
33403 +void idle_notifier_register(struct notifier_block *n)
33404 +{
33405 +       unsigned long flags;
33406 +       spin_lock_irqsave(&idle_notifier_lock, flags);
33407 +       notifier_chain_register(&idle_notifier, n);
33408 +       spin_unlock_irqrestore(&idle_notifier_lock, flags);
33409 +}
33410 +EXPORT_SYMBOL_GPL(idle_notifier_register);
33411 +
33412 +void idle_notifier_unregister(struct notifier_block *n)
33413 +{
33414 +       unsigned long flags;
33415 +       spin_lock_irqsave(&idle_notifier_lock, flags);
33416 +       notifier_chain_unregister(&idle_notifier, n);
33417 +       spin_unlock_irqrestore(&idle_notifier_lock, flags);
33418 +}
33419 +EXPORT_SYMBOL(idle_notifier_unregister);
33420 +
33421 +enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
33422 +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
33423 +
33424 +void enter_idle(void)
33425 +{
33426 +       __get_cpu_var(idle_state) = CPU_IDLE;
33427 +       notifier_call_chain(&idle_notifier, IDLE_START, NULL);
33428 +}
33429 +
33430 +static void __exit_idle(void)
33431 +{
33432 +       __get_cpu_var(idle_state) = CPU_NOT_IDLE;
33433 +       notifier_call_chain(&idle_notifier, IDLE_END, NULL);
33434 +}
33435 +
33436 +/* Called from interrupts to signify idle end */
33437 +void exit_idle(void)
33438 +{
33439 +       if (current->pid | read_pda(irqcount))
33440 +               return;
33441 +       __exit_idle();
33442 +}
33443 +
33444 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
33445 +extern void stop_hz_timer(void);
33446 +extern void start_hz_timer(void);
33447 +void xen_idle(void)
33448 +{
33449 +       local_irq_disable();
33450 +
33451 +       if (need_resched())
33452 +               local_irq_enable();
33453 +       else {
33454 +               clear_thread_flag(TIF_POLLING_NRFLAG);
33455 +               smp_mb__after_clear_bit();
33456 +               stop_hz_timer();
33457 +               /* Blocking includes an implicit local_irq_enable(). */
33458 +               HYPERVISOR_block();
33459 +               start_hz_timer();
33460 +               set_thread_flag(TIF_POLLING_NRFLAG);
33461 +       }
33462 +}
33463 +
33464 +#ifdef CONFIG_HOTPLUG_CPU
33465 +static inline void play_dead(void)
33466 +{
33467 +       idle_task_exit();
33468 +       local_irq_disable();
33469 +       cpu_clear(smp_processor_id(), cpu_initialized);
33470 +       preempt_enable_no_resched();
33471 +       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
33472 +       /* Same as drivers/xen/core/smpboot.c:cpu_bringup(). */
33473 +       cpu_init();
33474 +       touch_softlockup_watchdog();
33475 +       preempt_disable();
33476 +       local_irq_enable();
33477 +}
33478 +#else
33479 +static inline void play_dead(void)
33480 +{
33481 +       BUG();
33482 +}
33483 +#endif /* CONFIG_HOTPLUG_CPU */
33484 +
33485 +/*
33486 + * The idle thread. There's no useful work to be
33487 + * done, so just try to conserve power and have a
33488 + * low exit latency (i.e. sit in a loop waiting for
33489 + * somebody to say that they'd like to reschedule).
33490 + */
33491 +void cpu_idle (void)
33492 +{
33493 +       set_thread_flag(TIF_POLLING_NRFLAG);
33494 +
33495 +       /* endless idle loop with no priority at all */
33496 +       while (1) {
33497 +               while (!need_resched()) {
33498 +                       if (__get_cpu_var(cpu_idle_state))
33499 +                               __get_cpu_var(cpu_idle_state) = 0;
33500 +                       rmb();
33501 +                       
33502 +                       if (cpu_is_offline(smp_processor_id()))
33503 +                               play_dead();
33504 +                       enter_idle();
33505 +                       xen_idle();
33506 +                       __exit_idle();
33507 +               }
33508 +
33509 +               preempt_enable_no_resched();
33510 +               schedule();
33511 +               preempt_disable();
33512 +       }
33513 +}
33514 +
33515 +void cpu_idle_wait(void)
33516 +{
33517 +       unsigned int cpu, this_cpu = get_cpu();
33518 +       cpumask_t map;
33519 +
33520 +       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
33521 +       put_cpu();
33522 +
33523 +       cpus_clear(map);
33524 +       for_each_online_cpu(cpu) {
33525 +               per_cpu(cpu_idle_state, cpu) = 1;
33526 +               cpu_set(cpu, map);
33527 +       }
33528 +
33529 +       __get_cpu_var(cpu_idle_state) = 0;
33530 +
33531 +       wmb();
33532 +       do {
33533 +               ssleep(1);
33534 +               for_each_online_cpu(cpu) {
33535 +                       if (cpu_isset(cpu, map) &&
33536 +                                       !per_cpu(cpu_idle_state, cpu))
33537 +                               cpu_clear(cpu, map);
33538 +               }
33539 +               cpus_and(map, map, cpu_online_map);
33540 +       } while (!cpus_empty(map));
33541 +}
33542 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
33543 +
33544 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
33545 +/* Always use xen_idle() instead. */
33546 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
33547 +
33548 +/* Prints also some state that isn't saved in the pt_regs */ 
33549 +void __show_regs(struct pt_regs * regs)
33550 +{
33551 +       unsigned long fs, gs, shadowgs;
33552 +       unsigned int fsindex,gsindex;
33553 +       unsigned int ds,cs,es; 
33554 +
33555 +       printk("\n");
33556 +       print_modules();
33557 +       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
33558 +               current->pid, current->comm, print_tainted(),
33559 +               system_utsname.release,
33560 +               (int)strcspn(system_utsname.version, " "),
33561 +               system_utsname.version);
33562 +       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
33563 +       printk_address(regs->rip); 
33564 +       printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
33565 +               regs->eflags);
33566 +       printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
33567 +              regs->rax, regs->rbx, regs->rcx);
33568 +       printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
33569 +              regs->rdx, regs->rsi, regs->rdi); 
33570 +       printk("RBP: %016lx R08: %016lx R09: %016lx\n",
33571 +              regs->rbp, regs->r8, regs->r9); 
33572 +       printk("R10: %016lx R11: %016lx R12: %016lx\n",
33573 +              regs->r10, regs->r11, regs->r12); 
33574 +       printk("R13: %016lx R14: %016lx R15: %016lx\n",
33575 +              regs->r13, regs->r14, regs->r15); 
33576 +
33577 +       asm("mov %%ds,%0" : "=r" (ds)); 
33578 +       asm("mov %%cs,%0" : "=r" (cs)); 
33579 +       asm("mov %%es,%0" : "=r" (es)); 
33580 +       asm("mov %%fs,%0" : "=r" (fsindex));
33581 +       asm("mov %%gs,%0" : "=r" (gsindex));
33582 +
33583 +       rdmsrl(MSR_FS_BASE, fs);
33584 +       rdmsrl(MSR_GS_BASE, gs); 
33585 +       rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
33586 +
33587 +       printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
33588 +              fs,fsindex,gs,gsindex,shadowgs); 
33589 +       printk("CS:  %04x DS: %04x ES: %04x\n", cs, ds, es); 
33590 +
33591 +}
33592 +
33593 +void show_regs(struct pt_regs *regs)
33594 +{
33595 +       printk("CPU %d:", smp_processor_id());
33596 +       __show_regs(regs);
33597 +       show_trace(&regs->rsp);
33598 +}
33599 +
33600 +/*
33601 + * Free current thread data structures, etc.
33602 + */
33603 +void exit_thread(void)
33604 +{
33605 +       struct task_struct *me = current;
33606 +       struct thread_struct *t = &me->thread;
33607 +
33608 +       /*
33609 +        * Remove function-return probe instances associated with this task
33610 +        * and put them back on the free list. Do not insert an exit probe for
33611 +        * this function, it will be disabled by kprobe_flush_task if you do.
33612 +        */
33613 +       kprobe_flush_task(me);
33614 +
33615 +       if (me->thread.io_bitmap_ptr) { 
33616 +#ifndef CONFIG_X86_NO_TSS
33617 +               struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
33618 +#endif
33619 +#ifdef CONFIG_XEN
33620 +               static physdev_op_t iobmp_op = {
33621 +                       .cmd = PHYSDEVOP_SET_IOBITMAP
33622 +               };
33623 +#endif
33624 +
33625 +               kfree(t->io_bitmap_ptr);
33626 +               t->io_bitmap_ptr = NULL;
33627 +               /*
33628 +                * Careful, clear this in the TSS too:
33629 +                */
33630 +#ifndef CONFIG_X86_NO_TSS
33631 +               memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
33632 +               put_cpu();
33633 +#endif
33634 +#ifdef CONFIG_XEN
33635 +               HYPERVISOR_physdev_op(&iobmp_op);
33636 +#endif
33637 +               t->io_bitmap_max = 0;
33638 +       }
33639 +}
33640 +
33641 +void load_gs_index(unsigned gs)
33642 +{
33643 +       HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
33644 +}
33645 +
33646 +void flush_thread(void)
33647 +{
33648 +       struct task_struct *tsk = current;
33649 +       struct thread_info *t = current_thread_info();
33650 +
33651 +       if (t->flags & _TIF_ABI_PENDING)
33652 +               t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
33653 +
33654 +       tsk->thread.debugreg0 = 0;
33655 +       tsk->thread.debugreg1 = 0;
33656 +       tsk->thread.debugreg2 = 0;
33657 +       tsk->thread.debugreg3 = 0;
33658 +       tsk->thread.debugreg6 = 0;
33659 +       tsk->thread.debugreg7 = 0;
33660 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
33661 +       /*
33662 +        * Forget coprocessor state.
33663 +        */
33664 +       clear_fpu(tsk);
33665 +       clear_used_math();
33666 +}
33667 +
33668 +void release_thread(struct task_struct *dead_task)
33669 +{
33670 +       if (dead_task->mm) {
33671 +               if (dead_task->mm->context.size) {
33672 +                       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
33673 +                                       dead_task->comm,
33674 +                                       dead_task->mm->context.ldt,
33675 +                                       dead_task->mm->context.size);
33676 +                       BUG();
33677 +               }
33678 +       }
33679 +}
33680 +
33681 +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
33682 +{
33683 +       struct user_desc ud = { 
33684 +               .base_addr = addr,
33685 +               .limit = 0xfffff,
33686 +               .contents = (3 << 3), /* user */
33687 +               .seg_32bit = 1,
33688 +               .limit_in_pages = 1,
33689 +               .useable = 1,
33690 +       };
33691 +       struct n_desc_struct *desc = (void *)t->thread.tls_array;
33692 +       desc += tls;
33693 +       desc->a = LDT_entry_a(&ud); 
33694 +       desc->b = LDT_entry_b(&ud); 
33695 +}
33696 +
33697 +static inline u32 read_32bit_tls(struct task_struct *t, int tls)
33698 +{
33699 +       struct desc_struct *desc = (void *)t->thread.tls_array;
33700 +       desc += tls;
33701 +       return desc->base0 | 
33702 +               (((u32)desc->base1) << 16) | 
33703 +               (((u32)desc->base2) << 24);
33704 +}
33705 +
33706 +/*
33707 + * This gets called before we allocate a new thread and copy
33708 + * the current task into it.
33709 + */
33710 +void prepare_to_copy(struct task_struct *tsk)
33711 +{
33712 +       unlazy_fpu(tsk);
33713 +}
33714 +
33715 +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 
33716 +               unsigned long unused,
33717 +       struct task_struct * p, struct pt_regs * regs)
33718 +{
33719 +       int err;
33720 +       struct pt_regs * childregs;
33721 +       struct task_struct *me = current;
33722 +
33723 +       childregs = ((struct pt_regs *)
33724 +                       (THREAD_SIZE + task_stack_page(p))) - 1;
33725 +       *childregs = *regs;
33726 +
33727 +       childregs->rax = 0;
33728 +       childregs->rsp = rsp;
33729 +       if (rsp == ~0UL)
33730 +               childregs->rsp = (unsigned long)childregs;
33731 +
33732 +       p->thread.rsp = (unsigned long) childregs;
33733 +       p->thread.rsp0 = (unsigned long) (childregs+1);
33734 +       p->thread.userrsp = me->thread.userrsp; 
33735 +
33736 +       set_tsk_thread_flag(p, TIF_FORK);
33737 +
33738 +       p->thread.fs = me->thread.fs;
33739 +       p->thread.gs = me->thread.gs;
33740 +
33741 +       asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
33742 +       asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
33743 +       asm("mov %%es,%0" : "=m" (p->thread.es));
33744 +       asm("mov %%ds,%0" : "=m" (p->thread.ds));
33745 +
33746 +       if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 
33747 +               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
33748 +               if (!p->thread.io_bitmap_ptr) {
33749 +                       p->thread.io_bitmap_max = 0;
33750 +                       return -ENOMEM;
33751 +               }
33752 +               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
33753 +                               IO_BITMAP_BYTES);
33754 +       } 
33755 +
33756 +       /*
33757 +        * Set a new TLS for the child thread?
33758 +        */
33759 +       if (clone_flags & CLONE_SETTLS) {
33760 +#ifdef CONFIG_IA32_EMULATION
33761 +               if (test_thread_flag(TIF_IA32))
33762 +                       err = ia32_child_tls(p, childregs); 
33763 +               else                    
33764 +#endif  
33765 +                       err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
33766 +               if (err) 
33767 +                       goto out;
33768 +       }
33769 +        p->thread.iopl = current->thread.iopl;
33770 +
33771 +       err = 0;
33772 +out:
33773 +       if (err && p->thread.io_bitmap_ptr) {
33774 +               kfree(p->thread.io_bitmap_ptr);
33775 +               p->thread.io_bitmap_max = 0;
33776 +       }
33777 +       return err;
33778 +}
33779 +
33780 +static inline void __save_init_fpu( struct task_struct *tsk )
33781 +{
33782 +       asm volatile( "rex64 ; fxsave %0 ; fnclex"
33783 +                     : "=m" (tsk->thread.i387.fxsave));
33784 +       tsk->thread_info->status &= ~TS_USEDFPU;
33785 +}
33786 +
33787 +/*
33788 + *     switch_to(x,y) should switch tasks from x to y.
33789 + *
33790 + * This could still be optimized: 
33791 + * - fold all the options into a flag word and test it with a single test.
33792 + * - could test fs/gs bitsliced
33793 + *
33794 + * Kprobes not supported here. Set the probe on schedule instead.
33795 + */
33796 +__kprobes struct task_struct *
33797 +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
33798 +{
33799 +       struct thread_struct *prev = &prev_p->thread,
33800 +                                *next = &next_p->thread;
33801 +       int cpu = smp_processor_id();  
33802 +#ifndef CONFIG_X86_NO_TSS
33803 +       struct tss_struct *tss = &per_cpu(init_tss, cpu);
33804 +#endif
33805 +       physdev_op_t iopl_op, iobmp_op;
33806 +       multicall_entry_t _mcl[8], *mcl = _mcl;
33807 +
33808 +       /*
33809 +        * This is basically '__unlazy_fpu', except that we queue a
33810 +        * multicall to indicate FPU task switch, rather than
33811 +        * synchronously trapping to Xen.
33812 +        */
33813 +       if (prev_p->thread_info->status & TS_USEDFPU) {
33814 +               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
33815 +               mcl->op      = __HYPERVISOR_fpu_taskswitch;
33816 +               mcl->args[0] = 1;
33817 +               mcl++;
33818 +       }
33819 +
33820 +       /*
33821 +        * Reload esp0, LDT and the page table pointer:
33822 +        */
33823 +       mcl->op      = __HYPERVISOR_stack_switch;
33824 +       mcl->args[0] = __KERNEL_DS;
33825 +       mcl->args[1] = next->rsp0;
33826 +       mcl++;
33827 +
33828 +       /*
33829 +        * Load the per-thread Thread-Local Storage descriptor.
33830 +        * This is load_TLS(next, cpu) with multicalls.
33831 +        */
33832 +#define C(i) do {                                                      \
33833 +       if (unlikely(next->tls_array[i] != prev->tls_array[i])) {       \
33834 +               mcl->op      = __HYPERVISOR_update_descriptor;          \
33835 +               mcl->args[0] = virt_to_machine(                         \
33836 +                       &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]);          \
33837 +               mcl->args[1] = next->tls_array[i];                      \
33838 +               mcl++;                                                  \
33839 +       }                                                               \
33840 +} while (0)
33841 +       C(0); C(1); C(2);
33842 +#undef C
33843 +
33844 +       if (unlikely(prev->iopl != next->iopl)) {
33845 +               iopl_op.cmd             = PHYSDEVOP_SET_IOPL;
33846 +               iopl_op.u.set_iopl.iopl = (next->iopl == 0) ? 1 : next->iopl;
33847 +               mcl->op      = __HYPERVISOR_physdev_op;
33848 +               mcl->args[0] = (unsigned long)&iopl_op;
33849 +               mcl++;
33850 +       }
33851 +
33852 +       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
33853 +               iobmp_op.cmd                     =
33854 +                       PHYSDEVOP_SET_IOBITMAP;
33855 +               iobmp_op.u.set_iobitmap.bitmap   =
33856 +                       (char *)next->io_bitmap_ptr;
33857 +               iobmp_op.u.set_iobitmap.nr_ports =
33858 +                       next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
33859 +               mcl->op      = __HYPERVISOR_physdev_op;
33860 +               mcl->args[0] = (unsigned long)&iobmp_op;
33861 +               mcl++;
33862 +       }
33863 +
33864 +       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
33865 +       /* 
33866 +        * Switch DS and ES.
33867 +        * This won't pick up thread selector changes, but I guess that is ok.
33868 +        */
33869 +       if (unlikely(next->es))
33870 +               loadsegment(es, next->es); 
33871 +       
33872 +       if (unlikely(next->ds))
33873 +               loadsegment(ds, next->ds);
33874 +
33875 +       /* 
33876 +        * Switch FS and GS.
33877 +        */
33878 +       if (unlikely(next->fsindex))
33879 +               loadsegment(fs, next->fsindex);
33880 +
33881 +       if (next->fs)
33882 +               HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs); 
33883 +       
33884 +       if (unlikely(next->gsindex))
33885 +               load_gs_index(next->gsindex);
33886 +
33887 +       if (next->gs)
33888 +               HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs); 
33889 +
33890 +       /* 
33891 +        * Switch the PDA context.
33892 +        */
33893 +       prev->userrsp = read_pda(oldrsp); 
33894 +       write_pda(oldrsp, next->userrsp); 
33895 +       write_pda(pcurrent, next_p); 
33896 +       write_pda(kernelstack,
33897 +                 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
33898 +
33899 +       /*
33900 +        * Now maybe reload the debug registers
33901 +        */
33902 +       if (unlikely(next->debugreg7)) {
33903 +               set_debugreg(next->debugreg0, 0);
33904 +               set_debugreg(next->debugreg1, 1);
33905 +               set_debugreg(next->debugreg2, 2);
33906 +               set_debugreg(next->debugreg3, 3);
33907 +               /* no 4 and 5 */
33908 +               set_debugreg(next->debugreg6, 6);
33909 +               set_debugreg(next->debugreg7, 7);
33910 +       }
33911 +
33912 +       return prev_p;
33913 +}
33914 +
33915 +/*
33916 + * sys_execve() executes a new program.
33917 + */
33918 +asmlinkage 
33919 +long sys_execve(char __user *name, char __user * __user *argv,
33920 +               char __user * __user *envp, struct pt_regs regs)
33921 +{
33922 +       long error;
33923 +       char * filename;
33924 +
33925 +       filename = getname(name);
33926 +       error = PTR_ERR(filename);
33927 +       if (IS_ERR(filename)) 
33928 +               return error;
33929 +       error = do_execve(filename, argv, envp, &regs); 
33930 +       if (error == 0) {
33931 +               task_lock(current);
33932 +               current->ptrace &= ~PT_DTRACE;
33933 +               task_unlock(current);
33934 +       }
33935 +       putname(filename);
33936 +       return error;
33937 +}
33938 +
33939 +void set_personality_64bit(void)
33940 +{
33941 +       /* inherit personality from parent */
33942 +
33943 +       /* Make sure to be in 64bit mode */
33944 +       clear_thread_flag(TIF_IA32); 
33945 +
33946 +       /* TBD: overwrites user setup. Should have two bits.
33947 +          But 64bit processes have always behaved this way,
33948 +          so it's not too bad. The main problem is just that
33949 +          32bit children are affected again. */
33950 +       current->personality &= ~READ_IMPLIES_EXEC;
33951 +}
33952 +
33953 +asmlinkage long sys_fork(struct pt_regs *regs)
33954 +{
33955 +       return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
33956 +}
33957 +
33958 +asmlinkage long
33959 +sys_clone(unsigned long clone_flags, unsigned long newsp,
33960 +         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
33961 +{
33962 +       if (!newsp)
33963 +               newsp = regs->rsp;
33964 +       return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
33965 +}
33966 +
33967 +/*
33968 + * This is trivial, and on the face of it looks like it
33969 + * could equally well be done in user mode.
33970 + *
33971 + * Not so, for quite unobvious reasons - register pressure.
33972 + * In user mode vfork() cannot have a stack frame, and if
33973 + * done by calling the "clone()" system call directly, you
33974 + * do not have enough call-clobbered registers to hold all
33975 + * the information you need.
33976 + */
33977 +asmlinkage long sys_vfork(struct pt_regs *regs)
33978 +{
33979 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
33980 +                   NULL, NULL);
33981 +}
33982 +
33983 +unsigned long get_wchan(struct task_struct *p)
33984 +{
33985 +       unsigned long stack;
33986 +       u64 fp,rip;
33987 +       int count = 0;
33988 +
33989 +       if (!p || p == current || p->state==TASK_RUNNING)
33990 +               return 0; 
33991 +       stack = (unsigned long)task_stack_page(p);
33992 +       if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
33993 +               return 0;
33994 +       fp = *(u64 *)(p->thread.rsp);
33995 +       do { 
33996 +               if (fp < (unsigned long)stack ||
33997 +                   fp > (unsigned long)stack+THREAD_SIZE)
33998 +                       return 0; 
33999 +               rip = *(u64 *)(fp+8); 
34000 +               if (!in_sched_functions(rip))
34001 +                       return rip; 
34002 +               fp = *(u64 *)fp; 
34003 +       } while (count++ < 16); 
34004 +       return 0;
34005 +}
34006 +
34007 +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
34008 +{ 
34009 +       int ret = 0; 
34010 +       int doit = task == current;
34011 +       int cpu;
34012 +
34013 +       switch (code) { 
34014 +       case ARCH_SET_GS:
34015 +               if (addr >= TASK_SIZE_OF(task))
34016 +                       return -EPERM; 
34017 +               cpu = get_cpu();
34018 +               /* handle small bases via the GDT because that's faster to 
34019 +                  switch. */
34020 +               if (addr <= 0xffffffff) {  
34021 +                       set_32bit_tls(task, GS_TLS, addr); 
34022 +                       if (doit) { 
34023 +                               load_TLS(&task->thread, cpu);
34024 +                               load_gs_index(GS_TLS_SEL); 
34025 +                       }
34026 +                       task->thread.gsindex = GS_TLS_SEL; 
34027 +                       task->thread.gs = 0;
34028 +               } else { 
34029 +                       task->thread.gsindex = 0;
34030 +                       task->thread.gs = addr;
34031 +                       if (doit) {
34032 +                               load_gs_index(0);
34033 +                               ret = HYPERVISOR_set_segment_base(
34034 +                                       SEGBASE_GS_USER, addr);
34035 +                       } 
34036 +               }
34037 +               put_cpu();
34038 +               break;
34039 +       case ARCH_SET_FS:
34040 +               /* Not strictly needed for fs, but do it for symmetry
34041 +                  with gs */
34042 +               if (addr >= TASK_SIZE_OF(task))
34043 +                       return -EPERM; 
34044 +               cpu = get_cpu();
34045 +               /* handle small bases via the GDT because that's faster to 
34046 +                  switch. */
34047 +               if (addr <= 0xffffffff) { 
34048 +                       set_32bit_tls(task, FS_TLS, addr);
34049 +                       if (doit) { 
34050 +                               load_TLS(&task->thread, cpu); 
34051 +                               asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
34052 +                       }
34053 +                       task->thread.fsindex = FS_TLS_SEL;
34054 +                       task->thread.fs = 0;
34055 +               } else { 
34056 +                       task->thread.fsindex = 0;
34057 +                       task->thread.fs = addr;
34058 +                       if (doit) {
34059 +                               /* set the selector to 0 to not confuse
34060 +                                  __switch_to */
34061 +                               asm volatile("movl %0,%%fs" :: "r" (0));
34062 +                                ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
34063 +                                                                 addr);
34064 +                       }
34065 +               }
34066 +               put_cpu();
34067 +               break;
34068 +       case ARCH_GET_FS: { 
34069 +               unsigned long base; 
34070 +               if (task->thread.fsindex == FS_TLS_SEL)
34071 +                       base = read_32bit_tls(task, FS_TLS);
34072 +               else if (doit)
34073 +                       rdmsrl(MSR_FS_BASE, base);
34074 +               else
34075 +                       base = task->thread.fs;
34076 +               ret = put_user(base, (unsigned long __user *)addr); 
34077 +               break; 
34078 +       }
34079 +       case ARCH_GET_GS: { 
34080 +               unsigned long base;
34081 +               if (task->thread.gsindex == GS_TLS_SEL)
34082 +                       base = read_32bit_tls(task, GS_TLS);
34083 +               else if (doit)
34084 +                       rdmsrl(MSR_KERNEL_GS_BASE, base);
34085 +               else
34086 +                       base = task->thread.gs;
34087 +               ret = put_user(base, (unsigned long __user *)addr); 
34088 +               break;
34089 +       }
34090 +
34091 +       default:
34092 +               ret = -EINVAL;
34093 +               break;
34094 +       } 
34095 +
34096 +       return ret;     
34097 +} 
34098 +
34099 +long sys_arch_prctl(int code, unsigned long addr)
34100 +{
34101 +       return do_arch_prctl(current, code, addr);
34102 +} 
34103 +
34104 +/* 
34105 + * Capture the user space registers if the task is not running (in user space)
34106 + */
34107 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
34108 +{
34109 +       struct pt_regs *pp, ptregs;
34110 +
34111 +       pp = task_pt_regs(tsk);
34112 +
34113 +       ptregs = *pp; 
34114 +       ptregs.cs &= 0xffff;
34115 +       ptregs.ss &= 0xffff;
34116 +
34117 +       elf_core_copy_regs(regs, &ptregs);
34118 +
34119 +        boot_option_idle_override = 1;
34120 +       return 1;
34121 +}
34122 +
34123 +unsigned long arch_align_stack(unsigned long sp)
34124 +{
34125 +       if (randomize_va_space)
34126 +               sp -= get_random_int() % 8192;
34127 +       return sp & ~0xf;
34128 +}
34129 +
34130 +#ifndef CONFIG_SMP
34131 +void _restore_vcpu(void)
34132 +{
34133 +}
34134 +#endif
34135 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/setup64-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/setup64-xen.c
34136 --- ref-linux-2.6.16.9/arch/x86_64/kernel/setup64-xen.c 1970-01-01 01:00:00.000000000 +0100
34137 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/setup64-xen.c    2006-04-10 00:05:52.000000000 +0200
34138 @@ -0,0 +1,360 @@
34139 +/* 
34140 + * X86-64 specific CPU setup.
34141 + * Copyright (C) 1995  Linus Torvalds
34142 + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
34143 + * See setup.c for older changelog.
34144 + * $Id$
34145 + *
34146 + * Jun Nakajima <jun.nakajima@intel.com> 
34147 + *   Modified for Xen
34148 + *
34149 + */ 
34150 +#include <linux/config.h>
34151 +#include <linux/init.h>
34152 +#include <linux/kernel.h>
34153 +#include <linux/sched.h>
34154 +#include <linux/string.h>
34155 +#include <linux/bootmem.h>
34156 +#include <linux/bitops.h>
34157 +#include <linux/module.h>
34158 +#include <asm/bootsetup.h>
34159 +#include <asm/pda.h>
34160 +#include <asm/pgtable.h>
34161 +#include <asm/processor.h>
34162 +#include <asm/desc.h>
34163 +#include <asm/atomic.h>
34164 +#include <asm/mmu_context.h>
34165 +#include <asm/smp.h>
34166 +#include <asm/i387.h>
34167 +#include <asm/percpu.h>
34168 +#include <asm/proto.h>
34169 +#include <asm/sections.h>
34170 +#ifdef CONFIG_XEN
34171 +#include <asm/hypervisor.h>
34172 +#endif
34173 +
34174 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
34175 +
34176 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
34177 +
34178 +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
34179 +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
34180 +
34181 +#ifndef CONFIG_X86_NO_IDT
34182 +struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; 
34183 +#endif
34184 +
34185 +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
34186 +
34187 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
34188 +static int do_not_nx __cpuinitdata = 0;
34189 +
34190 +/* noexec=on|off
34191 +Control non executable mappings for 64bit processes.
34192 +
34193 +on     Enable(default)
34194 +off    Disable
34195 +*/ 
34196 +int __init nonx_setup(char *str)
34197 +{
34198 +       if (!strncmp(str, "on", 2)) {
34199 +                __supported_pte_mask |= _PAGE_NX; 
34200 +               do_not_nx = 0; 
34201 +       } else if (!strncmp(str, "off", 3)) {
34202 +               do_not_nx = 1;
34203 +               __supported_pte_mask &= ~_PAGE_NX;
34204 +        }
34205 +       return 0;
34206 +} 
34207 +__setup("noexec=", nonx_setup);        /* parsed early actually */
34208 +
34209 +int force_personality32 = READ_IMPLIES_EXEC;
34210 +
34211 +/* noexec32=on|off
34212 +Control non executable heap for 32bit processes.
34213 +To control the stack too use noexec=off
34214 +
34215 +on     PROT_READ does not imply PROT_EXEC for 32bit processes
34216 +off    PROT_READ implies PROT_EXEC (default)
34217 +*/
34218 +static int __init nonx32_setup(char *str)
34219 +{
34220 +       if (!strcmp(str, "on"))
34221 +               force_personality32 &= ~READ_IMPLIES_EXEC;
34222 +       else if (!strcmp(str, "off"))
34223 +               force_personality32 |= READ_IMPLIES_EXEC;
34224 +       return 0;
34225 +}
34226 +__setup("noexec32=", nonx32_setup);
34227 +
34228 +/*
34229 + * Great future plan:
34230 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
34231 + * Always point %gs to its beginning
34232 + */
34233 +void __init setup_per_cpu_areas(void)
34234 +{ 
34235 +       int i;
34236 +       unsigned long size;
34237 +
34238 +#ifdef CONFIG_HOTPLUG_CPU
34239 +       prefill_possible_map();
34240 +#endif
34241 +
34242 +       /* Copy section for each CPU (we discard the original) */
34243 +       size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
34244 +#ifdef CONFIG_MODULES
34245 +       if (size < PERCPU_ENOUGH_ROOM)
34246 +               size = PERCPU_ENOUGH_ROOM;
34247 +#endif
34248 +
34249 +       for_each_cpu_mask (i, cpu_possible_map) {
34250 +               char *ptr;
34251 +
34252 +               if (!NODE_DATA(cpu_to_node(i))) {
34253 +                       printk("cpu with no node %d, num_online_nodes %d\n",
34254 +                              i, num_online_nodes());
34255 +                       ptr = alloc_bootmem(size);
34256 +               } else { 
34257 +                       ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
34258 +               }
34259 +               if (!ptr)
34260 +                       panic("Cannot allocate cpu data for CPU %d\n", i);
34261 +               cpu_pda(i)->data_offset = ptr - __per_cpu_start;
34262 +               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
34263 +       }
34264 +} 
34265 +
34266 +#ifdef CONFIG_XEN
34267 +static void switch_pt(void)
34268 +{
34269 +       xen_pt_switch(__pa(init_level4_pgt));
34270 +        xen_new_user_pt(__pa(init_level4_user_pgt));
34271 +}
34272 +
34273 +void __cpuinit cpu_gdt_init(struct desc_ptr *gdt_descr)
34274 +{
34275 +       unsigned long frames[16];
34276 +       unsigned long va;
34277 +       int f;
34278 +
34279 +       for (va = gdt_descr->address, f = 0;
34280 +            va < gdt_descr->address + gdt_descr->size;
34281 +            va += PAGE_SIZE, f++) {
34282 +               frames[f] = virt_to_mfn(va);
34283 +               make_page_readonly(
34284 +                       (void *)va, XENFEAT_writable_descriptor_tables);
34285 +       }
34286 +       if (HYPERVISOR_set_gdt(frames, gdt_descr->size /
34287 +                               sizeof (struct desc_struct)))
34288 +               BUG();
34289 +}
34290 +#else
34291 +static void switch_pt(void)
34292 +{
34293 +       asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
34294 +}
34295 +
34296 +void __init cpu_gdt_init(struct desc_ptr *gdt_descr)
34297 +{
34298 +       asm volatile("lgdt %0" :: "m" (*gdt_descr));
34299 +       asm volatile("lidt %0" :: "m" (idt_descr));
34300 +}
34301 +#endif
34302 +
34303 +void pda_init(int cpu)
34304 +{ 
34305 +       struct x8664_pda *pda = cpu_pda(cpu);
34306 +
34307 +       /* Set up data that may be needed in __get_free_pages early */
34308 +       asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
34309 +#ifndef CONFIG_XEN
34310 +       wrmsrl(MSR_GS_BASE, pda);
34311 +#else
34312 +       HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)pda);
34313 +#endif
34314 +       pda->cpunumber = cpu; 
34315 +       pda->irqcount = -1;
34316 +       pda->kernelstack = 
34317 +               (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
34318 +       pda->active_mm = &init_mm;
34319 +       pda->mmu_state = 0;
34320 +
34321 +       if (cpu == 0) {
34322 +#ifdef CONFIG_XEN
34323 +               xen_init_pt();
34324 +#endif
34325 +               /* others are initialized in smpboot.c */
34326 +               pda->pcurrent = &init_task;
34327 +               pda->irqstackptr = boot_cpu_stack; 
34328 +       } else {
34329 +               pda->irqstackptr = (char *)
34330 +                       __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
34331 +               if (!pda->irqstackptr)
34332 +                       panic("cannot allocate irqstack for cpu %d", cpu); 
34333 +       }
34334 +
34335 +       switch_pt();
34336 +
34337 +       pda->irqstackptr += IRQSTACKSIZE-64;
34338 +} 
34339 +
34340 +#ifndef CONFIG_X86_NO_TSS
34341 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
34342 +__attribute__((section(".bss.page_aligned")));
34343 +#endif
34344 +
34345 +/* May not be marked __init: used by software suspend */
34346 +void syscall_init(void)
34347 +{
34348 +#ifndef CONFIG_XEN
34349 +       /* 
34350 +        * LSTAR and STAR live in a somewhat strange symbiosis.
34351 +        * They both write to the same internal register. STAR allows setting CS/DS,
34352 +        * but only a 32bit target. LSTAR sets the 64bit rip.
34353 +        */ 
34354 +       wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
34355 +       wrmsrl(MSR_LSTAR, system_call); 
34356 +
34357 +       /* Flags to clear on syscall */
34358 +       wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
34359 +#endif
34360 +#ifdef CONFIG_IA32_EMULATION                   
34361 +       syscall32_cpu_init ();
34362 +#endif
34363 +}
34364 +
34365 +void __cpuinit check_efer(void)
34366 +{
34367 +       unsigned long efer;
34368 +
34369 +       rdmsrl(MSR_EFER, efer); 
34370 +        if (!(efer & EFER_NX) || do_not_nx) { 
34371 +                __supported_pte_mask &= ~_PAGE_NX; 
34372 +        }       
34373 +}
34374 +
34375 +/*
34376 + * cpu_init() initializes state that is per-CPU. Some data is already
34377 + * initialized (naturally) in the bootstrap process, such as the GDT
34378 + * and IDT. We reload them nevertheless, this function acts as a
34379 + * 'CPU state barrier', nothing should get across.
34380 + * A lot of state is already set up in PDA init.
34381 + */
34382 +void __cpuinit cpu_init (void)
34383 +{
34384 +       int cpu = stack_smp_processor_id();
34385 +#ifndef CONFIG_X86_NO_TSS
34386 +       struct tss_struct *t = &per_cpu(init_tss, cpu);
34387 +       unsigned long v; 
34388 +       char *estacks = NULL; 
34389 +       unsigned i;
34390 +#endif
34391 +       struct task_struct *me;
34392 +
34393 +       /* CPU 0 is initialised in head64.c */
34394 +       if (cpu != 0) {
34395 +               pda_init(cpu);
34396 +               zap_low_mappings(cpu);
34397 +       }
34398 +#ifndef CONFIG_X86_NO_TSS
34399 +       else
34400 +               estacks = boot_exception_stacks; 
34401 +#endif
34402 +
34403 +       me = current;
34404 +
34405 +       if (cpu_test_and_set(cpu, cpu_initialized))
34406 +               panic("CPU#%d already initialized!\n", cpu);
34407 +
34408 +       printk("Initializing CPU#%d\n", cpu);
34409 +
34410 +       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
34411 +
34412 +       /*
34413 +        * Initialize the per-CPU GDT with the boot GDT,
34414 +        * and set up the GDT descriptor:
34415 +        */
34416 +#ifndef CONFIG_XEN 
34417 +       if (cpu)
34418 +               memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
34419 +#endif
34420 +
34421 +       cpu_gdt_descr[cpu].size = GDT_SIZE;
34422 +       cpu_gdt_init(&cpu_gdt_descr[cpu]);
34423 +
34424 +       memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
34425 +       syscall_init();
34426 +
34427 +       wrmsrl(MSR_FS_BASE, 0);
34428 +       wrmsrl(MSR_KERNEL_GS_BASE, 0);
34429 +       barrier(); 
34430 +
34431 +       check_efer();
34432 +
34433 +#ifndef CONFIG_X86_NO_TSS
34434 +       /*
34435 +        * set up and load the per-CPU TSS
34436 +        */
34437 +       for (v = 0; v < N_EXCEPTION_STACKS; v++) {
34438 +               if (cpu) {
34439 +                       static const unsigned int order[N_EXCEPTION_STACKS] = {
34440 +                               [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
34441 +                               [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
34442 +                       };
34443 +
34444 +                       estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
34445 +                       if (!estacks)
34446 +                               panic("Cannot allocate exception stack %ld %d\n",
34447 +                                     v, cpu); 
34448 +               }
34449 +               switch (v + 1) {
34450 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
34451 +               case DEBUG_STACK:
34452 +                       cpu_pda[cpu].debugstack = (unsigned long)estacks;
34453 +                       estacks += DEBUG_STKSZ;
34454 +                       break;
34455 +#endif
34456 +               default:
34457 +                       estacks += EXCEPTION_STKSZ;
34458 +                       break;
34459 +               }
34460 +               t->ist[v] = (unsigned long)estacks;
34461 +       }
34462 +
34463 +       t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
34464 +       /*
34465 +        * <= is required because the CPU will access up to
34466 +        * 8 bits beyond the end of the IO permission bitmap.
34467 +        */
34468 +       for (i = 0; i <= IO_BITMAP_LONGS; i++)
34469 +               t->io_bitmap[i] = ~0UL;
34470 +#endif
34471 +
34472 +       atomic_inc(&init_mm.mm_count);
34473 +       me->active_mm = &init_mm;
34474 +       if (me->mm)
34475 +               BUG();
34476 +       enter_lazy_tlb(&init_mm, me);
34477 +
34478 +#ifndef CONFIG_X86_NO_TSS
34479 +       set_tss_desc(cpu, t);
34480 +#endif
34481 +#ifndef CONFIG_XEN
34482 +       load_TR_desc();
34483 +#endif
34484 +       load_LDT(&init_mm.context);
34485 +
34486 +       /*
34487 +        * Clear all 6 debug registers:
34488 +        */
34489 +
34490 +       set_debug(0UL, 0);
34491 +       set_debug(0UL, 1);
34492 +       set_debug(0UL, 2);
34493 +       set_debug(0UL, 3);
34494 +       set_debug(0UL, 6);
34495 +       set_debug(0UL, 7);
34496 +
34497 +       fpu_init(); 
34498 +}
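
Aside (not part of the patch): setup_per_cpu_areas() above implements the
2.6.16-era per-CPU scheme. The linker gathers every per-CPU variable into one
template section bounded by __per_cpu_start/__per_cpu_end; boot code allocates
a private copy per possible CPU and records the byte distance from the
template to that copy in cpu_pda(i)->data_offset, so a variable's per-CPU
address is its template address plus the owning CPU's offset. The standalone
sketch below demonstrates the offset arithmetic; template_area and data_offset
are invented names for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* Stands in for the linker-provided per-CPU template section. */
static char template_area[64] = "per-cpu template";

int main(void)
{
	intptr_t data_offset[2];	/* stands in for cpu_pda(i)->data_offset */
	int cpu;

	for (cpu = 0; cpu < 2; cpu++) {
		char *copy = malloc(sizeof(template_area));

		if (!copy)
			return 1;
		memcpy(copy, template_area, sizeof(template_area));
		data_offset[cpu] = (intptr_t)copy - (intptr_t)template_area;
		/* A per-CPU variable at template address p lives at p + offset. */
		printf("cpu%d: copy holds \"%s\" at offset %ld\n",
		       cpu, template_area + data_offset[cpu],
		       (long)data_offset[cpu]);
	}
	return 0;
}
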
34499 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/setup-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/setup-xen.c
34500 --- ref-linux-2.6.16.9/arch/x86_64/kernel/setup-xen.c   1970-01-01 01:00:00.000000000 +0100
34501 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/setup-xen.c      2006-04-10 00:05:52.000000000 +0200
34502 @@ -0,0 +1,1661 @@
34503 +/*
34504 + *  linux/arch/x86-64/kernel/setup.c
34505 + *
34506 + *  Copyright (C) 1995  Linus Torvalds
34507 + *
34508 + *  Nov 2001 Dave Jones <davej@suse.de>
34509 + *  Forked from i386 setup code.
34510 + *
34511 + *  $Id$
34512 + */
34513 +
34514 +/*
34515 + * This file handles the architecture-dependent parts of initialization
34516 + */
34517 +
34518 +#include <linux/errno.h>
34519 +#include <linux/sched.h>
34520 +#include <linux/kernel.h>
34521 +#include <linux/mm.h>
34522 +#include <linux/stddef.h>
34523 +#include <linux/unistd.h>
34524 +#include <linux/ptrace.h>
34525 +#include <linux/slab.h>
34526 +#include <linux/user.h>
34527 +#include <linux/a.out.h>
34528 +#include <linux/tty.h>
34529 +#include <linux/ioport.h>
34530 +#include <linux/delay.h>
34531 +#include <linux/config.h>
34532 +#include <linux/init.h>
34533 +#include <linux/initrd.h>
34534 +#include <linux/highmem.h>
34535 +#include <linux/bootmem.h>
34536 +#include <linux/module.h>
34537 +#include <asm/processor.h>
34538 +#include <linux/console.h>
34539 +#include <linux/seq_file.h>
34540 +#include <linux/crash_dump.h>
34541 +#include <linux/root_dev.h>
34542 +#include <linux/pci.h>
34543 +#include <linux/acpi.h>
34544 +#include <linux/kallsyms.h>
34545 +#include <linux/edd.h>
34546 +#include <linux/mmzone.h>
34547 +#include <linux/kexec.h>
34548 +#include <linux/cpufreq.h>
34549 +#include <linux/dmi.h>
34550 +#include <linux/dma-mapping.h>
34551 +
34552 +#include <asm/mtrr.h>
34553 +#include <asm/uaccess.h>
34554 +#include <asm/system.h>
34555 +#include <asm/io.h>
34556 +#include <asm/smp.h>
34557 +#include <asm/msr.h>
34558 +#include <asm/desc.h>
34559 +#include <video/edid.h>
34560 +#include <asm/e820.h>
34561 +#include <asm/dma.h>
34562 +#include <asm/mpspec.h>
34563 +#include <asm/mmu_context.h>
34564 +#include <asm/bootsetup.h>
34565 +#include <asm/proto.h>
34566 +#include <asm/setup.h>
34567 +#include <asm/mach_apic.h>
34568 +#include <asm/numa.h>
34569 +#include <asm/swiotlb.h>
34570 +#include <asm/sections.h>
34571 +#include <asm/gart-mapping.h>
34572 +#ifdef CONFIG_XEN
34573 +#include <linux/percpu.h>
34574 +#include <xen/interface/physdev.h>
34575 +#include "setup_arch_pre.h"
34576 +#include <asm/hypervisor.h>
34577 +#include <xen/interface/nmi.h>
34578 +#include <xen/features.h>
34579 +#define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
34580 +#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)
34581 +#define end_pfn_map end_pfn
34582 +#include <asm/mach-xen/setup_arch_post.h>
34583 +
34584 +extern unsigned long start_pfn;
34585 +extern struct edid_info edid_info;
34586 +
34587 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
34588 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
34589 +
34590 +extern char hypercall_page[PAGE_SIZE];
34591 +EXPORT_SYMBOL(hypercall_page);
34592 +
34593 +/* Allows setting of maximum possible memory size  */
34594 +unsigned long xen_override_max_pfn;
34595 +
34596 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
34597 +static struct notifier_block xen_panic_block = {
34598 +       xen_panic_event, NULL, 0 /* try to go last */
34599 +};
34600 +
34601 +unsigned long *phys_to_machine_mapping;
34602 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
34603 +
34604 +EXPORT_SYMBOL(phys_to_machine_mapping);
34605 +
34606 +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
34607 +DEFINE_PER_CPU(int, nr_multicall_ents);
34608 +
34609 +/* Raw start-of-day parameters from the hypervisor. */
34610 +start_info_t *xen_start_info;
34611 +EXPORT_SYMBOL(xen_start_info);
34612 +#endif
34613 +
34614 +/*
34615 + * Machine setup..
34616 + */
34617 +
34618 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
34619 +
34620 +unsigned long mmu_cr4_features;
34621 +
34622 +int acpi_disabled;
34623 +EXPORT_SYMBOL(acpi_disabled);
34624 +#ifdef CONFIG_ACPI
34625 +extern int __initdata acpi_ht;
34626 +extern acpi_interrupt_flags    acpi_sci_flags;
34627 +int __initdata acpi_force = 0;
34628 +#endif
34629 +
34630 +int acpi_numa __initdata;
34631 +
34632 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
34633 +int bootloader_type;
34634 +
34635 +unsigned long saved_video_mode;
34636 +
34637 +/*
34638 + * Setup options
34639 + */
34640 +struct screen_info screen_info;
34641 +struct sys_desc_table_struct {
34642 +       unsigned short length;
34643 +       unsigned char table[0];
34644 +};
34645 +
34646 +struct edid_info edid_info;
34647 +struct e820map e820;
34648 +
34649 +extern int root_mountflags;
34650 +
34651 +char command_line[COMMAND_LINE_SIZE];
34652 +
34653 +struct resource standard_io_resources[] = {
34654 +       { .name = "dma1", .start = 0x00, .end = 0x1f,
34655 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34656 +       { .name = "pic1", .start = 0x20, .end = 0x21,
34657 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34658 +       { .name = "timer0", .start = 0x40, .end = 0x43,
34659 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34660 +       { .name = "timer1", .start = 0x50, .end = 0x53,
34661 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34662 +       { .name = "keyboard", .start = 0x60, .end = 0x6f,
34663 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34664 +       { .name = "dma page reg", .start = 0x80, .end = 0x8f,
34665 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34666 +       { .name = "pic2", .start = 0xa0, .end = 0xa1,
34667 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34668 +       { .name = "dma2", .start = 0xc0, .end = 0xdf,
34669 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34670 +       { .name = "fpu", .start = 0xf0, .end = 0xff,
34671 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO }
34672 +};
34673 +
34674 +#define STANDARD_IO_RESOURCES \
34675 +       (sizeof standard_io_resources / sizeof standard_io_resources[0])
34676 +
34677 +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
34678 +
34679 +struct resource data_resource = {
34680 +       .name = "Kernel data",
34681 +       .start = 0,
34682 +       .end = 0,
34683 +       .flags = IORESOURCE_RAM,
34684 +};
34685 +struct resource code_resource = {
34686 +       .name = "Kernel code",
34687 +       .start = 0,
34688 +       .end = 0,
34689 +       .flags = IORESOURCE_RAM,
34690 +};
34691 +
34692 +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
34693 +
34694 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
34695 +static struct resource system_rom_resource = {
34696 +       .name = "System ROM",
34697 +       .start = 0xf0000,
34698 +       .end = 0xfffff,
34699 +       .flags = IORESOURCE_ROM,
34700 +};
34701 +
34702 +static struct resource extension_rom_resource = {
34703 +       .name = "Extension ROM",
34704 +       .start = 0xe0000,
34705 +       .end = 0xeffff,
34706 +       .flags = IORESOURCE_ROM,
34707 +};
34708 +
34709 +static struct resource adapter_rom_resources[] = {
34710 +       { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
34711 +               .flags = IORESOURCE_ROM },
34712 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34713 +               .flags = IORESOURCE_ROM },
34714 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34715 +               .flags = IORESOURCE_ROM },
34716 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34717 +               .flags = IORESOURCE_ROM },
34718 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34719 +               .flags = IORESOURCE_ROM },
34720 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34721 +               .flags = IORESOURCE_ROM }
34722 +};
34723 +#endif
34724 +
34725 +#define ADAPTER_ROM_RESOURCES \
34726 +       (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
34727 +
34728 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
34729 +static struct resource video_rom_resource = {
34730 +       .name = "Video ROM",
34731 +       .start = 0xc0000,
34732 +       .end = 0xc7fff,
34733 +       .flags = IORESOURCE_ROM,
34734 +};
34735 +#endif
34736 +
34737 +static struct resource video_ram_resource = {
34738 +       .name = "Video RAM area",
34739 +       .start = 0xa0000,
34740 +       .end = 0xbffff,
34741 +       .flags = IORESOURCE_RAM,
34742 +};
34743 +
34744 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
34745 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
34746 +
34747 +static int __init romchecksum(unsigned char *rom, unsigned long length)
34748 +{
34749 +       unsigned char *p, sum = 0;
34750 +
34751 +       for (p = rom; p < rom + length; p++)
34752 +               sum += *p;
34753 +       return sum == 0;
34754 +}
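+
+/*
+ * A ROM passes romchecksum() above when its bytes sum to zero modulo
+ * 256: the BIOS stores a final pad byte chosen to cancel the running
+ * sum. A minimal userspace sketch of that invariant (toy buffer, not
+ * a real ROM image):
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned char rom[4] = { 0x55, 0xaa, 0x10, 0xf1 /* 256 - (0x55+0xaa+0x10) mod 256 */ };
+	unsigned char sum = 0;
+	unsigned int i;
+
+	for (i = 0; i < sizeof(rom); i++)
+		sum += rom[i];
+	printf("checksum ok: %d\n", sum == 0);	/* prints 1 */
+	return 0;
+}
+#endif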
34755 +
34756 +static void __init probe_roms(void)
34757 +{
34758 +       unsigned long start, length, upper;
34759 +       unsigned char *rom;
34760 +       int           i;
34761 +
34762 +       /* video rom */
34763 +       upper = adapter_rom_resources[0].start;
34764 +       for (start = video_rom_resource.start; start < upper; start += 2048) {
34765 +               rom = isa_bus_to_virt(start);
34766 +               if (!romsignature(rom))
34767 +                       continue;
34768 +
34769 +               video_rom_resource.start = start;
34770 +
34771 +               /* 0 < length <= 0x7f * 512, historically */
34772 +               length = rom[2] * 512;
34773 +
34774 +               /* if checksum okay, trust length byte */
34775 +               if (length && romchecksum(rom, length))
34776 +                       video_rom_resource.end = start + length - 1;
34777 +
34778 +               request_resource(&iomem_resource, &video_rom_resource);
34779 +               break;
34780 +       }
34781 +
34782 +       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
34783 +       if (start < upper)
34784 +               start = upper;
34785 +
34786 +       /* system rom */
34787 +       request_resource(&iomem_resource, &system_rom_resource);
34788 +       upper = system_rom_resource.start;
34789 +
34790 +       /* check for extension rom (ignore length byte!) */
34791 +       rom = isa_bus_to_virt(extension_rom_resource.start);
34792 +       if (romsignature(rom)) {
34793 +               length = extension_rom_resource.end - extension_rom_resource.start + 1;
34794 +               if (romchecksum(rom, length)) {
34795 +                       request_resource(&iomem_resource, &extension_rom_resource);
34796 +                       upper = extension_rom_resource.start;
34797 +               }
34798 +       }
34799 +
34800 +       /* check for adapter roms on 2k boundaries */
34801 +       for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
34802 +               rom = isa_bus_to_virt(start);
34803 +               if (!romsignature(rom))
34804 +                       continue;
34805 +
34806 +               /* 0 < length <= 0x7f * 512, historically */
34807 +               length = rom[2] * 512;
34808 +
34809 +               /* but accept any length that fits if checksum okay */
34810 +               if (!length || start + length > upper || !romchecksum(rom, length))
34811 +                       continue;
34812 +
34813 +               adapter_rom_resources[i].start = start;
34814 +               adapter_rom_resources[i].end = start + length - 1;
34815 +               request_resource(&iomem_resource, &adapter_rom_resources[i]);
34816 +
34817 +               start = adapter_rom_resources[i++].end & ~2047UL;
34818 +       }
34819 +}
34820 +#endif
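+
+/*
+ * probe_roms() resumes its scan on 2 KiB boundaries; the expression
+ * (end + 1 + 2047) & ~2047UL rounds one-past-the-end up to the next
+ * 2048-byte multiple. A sketch with the video ROM default:
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned long end = 0xc7fff;	/* last byte of the video ROM */
+
+	printf("%#lx\n", (end + 1 + 2047) & ~2047UL);	/* 0xc8000 */
+	return 0;
+}
+#endif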
34821 +
34822 +static __init void parse_cmdline_early (char ** cmdline_p)
34823 +{
34824 +       char c = ' ', *to = command_line, *from = COMMAND_LINE;
34825 +       int len = 0;
34826 +       int userdef = 0;
34827 +
34828 +       for (;;) {
34829 +               if (c != ' ') 
34830 +                       goto next_char; 
34831 +
34832 +#ifdef  CONFIG_SMP
34833 +               /*
34834 +                * If the BIOS enumerates physical processors before logical,
34835 +                * maxcpus=N at enumeration-time can be used to disable HT.
34836 +                */
34837 +               else if (!memcmp(from, "maxcpus=", 8)) {
34838 +                       extern unsigned int maxcpus;
34839 +
34840 +                       maxcpus = simple_strtoul(from + 8, NULL, 0);
34841 +               }
34842 +#endif
34843 +#ifdef CONFIG_ACPI
34844 +               /* "acpi=off" disables both ACPI table parsing and interpreter init */
34845 +               if (!memcmp(from, "acpi=off", 8))
34846 +                       disable_acpi();
34847 +
34848 +               if (!memcmp(from, "acpi=force", 10)) { 
34849 +                       /* add later when we do DMI horrors: */
34850 +                       acpi_force = 1;
34851 +                       acpi_disabled = 0;
34852 +               }
34853 +
34854 +               /* acpi=ht just means: do ACPI MADT parsing 
34855 +                  at bootup, but don't enable the full ACPI interpreter */
34856 +               if (!memcmp(from, "acpi=ht", 7)) { 
34857 +                       if (!acpi_force)
34858 +                               disable_acpi();
34859 +                       acpi_ht = 1; 
34860 +               }
34861 +                else if (!memcmp(from, "pci=noacpi", 10)) 
34862 +                       acpi_disable_pci();
34863 +               else if (!memcmp(from, "acpi=noirq", 10))
34864 +                       acpi_noirq_set();
34865 +
34866 +               else if (!memcmp(from, "acpi_sci=edge", 13))
34867 +                       acpi_sci_flags.trigger = 1;
34868 +               else if (!memcmp(from, "acpi_sci=level", 14))
34869 +                       acpi_sci_flags.trigger = 3;
34870 +               else if (!memcmp(from, "acpi_sci=high", 13))
34871 +                       acpi_sci_flags.polarity = 1;
34872 +               else if (!memcmp(from, "acpi_sci=low", 12))
34873 +                       acpi_sci_flags.polarity = 3;
34874 +
34875 +               /* acpi=strict disables out-of-spec workarounds */
34876 +               else if (!memcmp(from, "acpi=strict", 11)) {
34877 +                       acpi_strict = 1;
34878 +               }
34879 +#ifdef CONFIG_X86_IO_APIC
34880 +               else if (!memcmp(from, "acpi_skip_timer_override", 24))
34881 +                       acpi_skip_timer_override = 1;
34882 +#endif
34883 +#endif
34884 +
34885 +#ifndef CONFIG_XEN
34886 +               if (!memcmp(from, "nolapic", 7) ||
34887 +                   !memcmp(from, "disableapic", 11))
34888 +                       disable_apic = 1;
34889 +
34890 +               /* Don't confuse with noapictimer */
34891 +               if (!memcmp(from, "noapic", 6) &&
34892 +                       (from[6] == ' ' || from[6] == 0))
34893 +                       skip_ioapic_setup = 1;
34894 +
34895 +               /* Make sure to not confuse with apic= */
34896 +               if (!memcmp(from, "apic", 4) &&
34897 +                       (from[4] == ' ' || from[4] == 0)) {
34898 +                       skip_ioapic_setup = 0;
34899 +                       ioapic_force = 1;
34900 +               }
34901 +#endif
34902 +                       
34903 +               if (!memcmp(from, "mem=", 4))
34904 +                       parse_memopt(from+4, &from); 
34905 +
34906 +               if (!memcmp(from, "memmap=", 7)) {
34907 +                       /* exactmap option is for user-defined memory */
34908 +                       if (!memcmp(from+7, "exactmap", 8)) {
34909 +#ifdef CONFIG_CRASH_DUMP
34910 +                               /* If we are doing a crash dump, we
34911 +                                * still need to know the real mem
34912 +                                * size before original memory map is
34913 +                                * reset.
34914 +                                */
34915 +                               saved_max_pfn = e820_end_of_ram();
34916 +#endif
34917 +                               from += 8+7;
34918 +                               end_pfn_map = 0;
34919 +                               e820.nr_map = 0;
34920 +                               userdef = 1;
34921 +                       }
34922 +                       else {
34923 +                               parse_memmapopt(from+7, &from);
34924 +                               userdef = 1;
34925 +                       }
34926 +               }
34927 +
34928 +#ifdef CONFIG_NUMA
34929 +               if (!memcmp(from, "numa=", 5))
34930 +                       numa_setup(from+5); 
34931 +#endif
34932 +
34933 +               if (!memcmp(from,"iommu=",6)) { 
34934 +                       iommu_setup(from+6); 
34935 +               }
34936 +
34937 +               if (!memcmp(from,"oops=panic", 10))
34938 +                       panic_on_oops = 1;
34939 +
34940 +               if (!memcmp(from, "noexec=", 7))
34941 +                       nonx_setup(from + 7);
34942 +
34943 +#ifdef CONFIG_KEXEC
34944 +               /* crashkernel=size@addr specifies the location to reserve for
34945 +                * a crash kernel.  By reserving this memory we guarantee
34946 +                * that linux never sets it up as a DMA target.
34947 +                * Useful for holding code to do something appropriate
34948 +                * after a kernel panic.
34949 +                */
34950 +               else if (!memcmp(from, "crashkernel=", 12)) {
34951 +                       unsigned long size, base;
34952 +                       size = memparse(from+12, &from);
34953 +                       if (*from == '@') {
34954 +                               base = memparse(from+1, &from);
34955 +                               /* FIXME: Do I want a sanity check
34956 +                                * to validate the memory range?
34957 +                                */
34958 +                               crashk_res.start = base;
34959 +                               crashk_res.end   = base + size - 1;
34960 +                       }
34961 +               }
34962 +#endif
34963 +
34964 +#ifdef CONFIG_PROC_VMCORE
34965 +               /* elfcorehdr= specifies the location of elf core header
34966 +                * stored by the crashed kernel. This option will be passed
34967 +                * by kexec loader to the capture kernel.
34968 +                */
34969 +               else if(!memcmp(from, "elfcorehdr=", 11))
34970 +                       elfcorehdr_addr = memparse(from+11, &from);
34971 +#endif
34972 +
34973 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
34974 +               else if (!memcmp(from, "additional_cpus=", 16))
34975 +                       setup_additional_cpus(from+16);
34976 +#endif
34977 +
34978 +       next_char:
34979 +               c = *(from++);
34980 +               if (!c)
34981 +                       break;
34982 +               if (COMMAND_LINE_SIZE <= ++len)
34983 +                       break;
34984 +               *(to++) = c;
34985 +       }
34986 +       if (userdef) {
34987 +               printk(KERN_INFO "user-defined physical RAM map:\n");
34988 +               e820_print_map("user");
34989 +       }
34990 +       *to = '\0';
34991 +       *cmdline_p = command_line;
34992 +}
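+
+/*
+ * The parser above matches options by prefix with memcmp() and hands
+ * the tail after '=' to a numeric/size parser. A reduced userspace
+ * sketch of the same pattern (hypothetical option name):
+ */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int main(void)
+{
+	const char *from = "maxfoo=42 ro";
+
+	if (!memcmp(from, "maxfoo=", 7))
+		printf("maxfoo = %lu\n", strtoul(from + 7, NULL, 0));
+	return 0;
+}
+#endif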
34993 +
34994 +#ifndef CONFIG_NUMA
34995 +#ifdef CONFIG_XEN
34996 +static void __init
34997 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
34998 +{
34999 +        unsigned long bootmap_size;
35000 +
35001 +        bootmap_size = init_bootmem(start_pfn, end_pfn);
35002 +        free_bootmem(0, xen_start_info->nr_pages << PAGE_SHIFT);   
35003 +        reserve_bootmem(HIGH_MEMORY,
35004 +                        (PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1)
35005 +                        - HIGH_MEMORY);
35006 +}
35007 +#else
35008 +static void __init
35009 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
35010 +{
35011 +       unsigned long bootmap_size, bootmap;
35012 +
35013 +       bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
35014 +       bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
35015 +       if (bootmap == -1L)
35016 +               panic("Cannot find bootmem map of size %ld\n",bootmap_size);
35017 +       bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
35018 +       e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
35019 +       reserve_bootmem(bootmap, bootmap_size);
35020 +} 
35021 +#endif /* !CONFIG_XEN */
35022 +#endif
35023 +
35024 +/* Use inline assembly to define this because the nops are defined 
35025 +   as inline assembly strings in the include files and we cannot 
35026 +   get them easily into strings. */
35027 +asm("\t.data\nk8nops: " 
35028 +    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
35029 +    K8_NOP7 K8_NOP8); 
35030 +    
35031 +extern unsigned char k8nops[];
35032 +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { 
35033 +     NULL,
35034 +     k8nops,
35035 +     k8nops + 1,
35036 +     k8nops + 1 + 2,
35037 +     k8nops + 1 + 2 + 3,
35038 +     k8nops + 1 + 2 + 3 + 4,
35039 +     k8nops + 1 + 2 + 3 + 4 + 5,
35040 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6,
35041 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
35042 +}; 
35043 +
35044 +extern char __vsyscall_0;
35045 +
35046 +/* Replace instructions with better alternatives for this CPU type.
35047 +
35048 +   This runs before SMP is initialized to avoid SMP problems with
35049 +   self-modifying code. This implies that asymmetric systems where
35050 +   APs have fewer capabilities than the boot processor are not handled.
35051 +   In this case boot with "noreplacement". */ 
35052 +void apply_alternatives(void *start, void *end) 
35053 +{ 
35054 +       struct alt_instr *a; 
35055 +       int diff, i, k;
35056 +       for (a = start; (void *)a < end; a++) { 
35057 +               u8 *instr;
35058 +
35059 +               if (!boot_cpu_has(a->cpuid))
35060 +                       continue;
35061 +
35062 +               BUG_ON(a->replacementlen > a->instrlen); 
35063 +               instr = a->instr;
35064 +               /* vsyscall code is not mapped yet. resolve it manually. */
35065 +               if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END)
35066 +                       instr -= VSYSCALL_START - (unsigned long)&__vsyscall_0;
35067 +               __inline_memcpy(instr, a->replacement, a->replacementlen);
35068 +               diff = a->instrlen - a->replacementlen; 
35069 +
35070 +               /* Pad the rest with nops */
35071 +               for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
35072 +                       k = diff;
35073 +                       if (k > ASM_NOP_MAX)
35074 +                               k = ASM_NOP_MAX;
35075 +                       __inline_memcpy(instr + i, k8_nops[k], k);
35076 +               } 
35077 +       }
35078 +} 
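+
+/*
+ * The padding loop above fills the (instrlen - replacementlen) gap
+ * with K8 NOPs of at most ASM_NOP_MAX (8) bytes each. A sketch of how
+ * a 19-byte gap decomposes:
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+	int diff = 19, k;
+
+	while (diff > 0) {
+		k = diff > 8 ? 8 : diff;	/* one k-byte NOP, k8_nops[k] */
+		printf("emit %d-byte nop\n", k);	/* 8, 8, 3 */
+		diff -= k;
+	}
+	return 0;
+}
+#endif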
35079 +
35080 +static int no_replacement __initdata = 0; 
35081 +
35082 +void __init alternative_instructions(void)
35083 +{
35084 +       extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
35085 +       if (no_replacement) 
35086 +               return;
35087 +       apply_alternatives(__alt_instructions, __alt_instructions_end);
35088 +}
35089 +
35090 +static int __init noreplacement_setup(char *s)
35091 +{ 
35092 +     no_replacement = 1; 
35093 +     return 0; 
35094 +} 
35095 +
35096 +__setup("noreplacement", noreplacement_setup); 
35097 +
35098 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
35099 +struct edd edd;
35100 +#ifdef CONFIG_EDD_MODULE
35101 +EXPORT_SYMBOL(edd);
35102 +#endif
35103 +/**
35104 + * copy_edd() - Copy the BIOS EDD information
35105 + *              from boot_params into a safe place.
35106 + *
35107 + */
35108 +static inline void copy_edd(void)
35109 +{
35110 +     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
35111 +     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
35112 +     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
35113 +     edd.edd_info_nr = EDD_NR;
35114 +}
35115 +#else
35116 +static inline void copy_edd(void)
35117 +{
35118 +}
35119 +#endif
35120 +
35121 +#ifndef CONFIG_XEN
35122 +#define EBDA_ADDR_POINTER 0x40E
35123 +static void __init reserve_ebda_region(void)
35124 +{
35125 +       unsigned int addr;
35126 +       /** 
35127 +        * there is a real-mode segmented pointer pointing to the 
35128 +        * 4K EBDA area at 0x40E
35129 +        */
35130 +       addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
35131 +       addr <<= 4;
35132 +       if (addr)
35133 +               reserve_bootmem_generic(addr, PAGE_SIZE);
35134 +}
35135 +#endif
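+
+/*
+ * The word at 0x40e is a real-mode segment; shifting it left by four
+ * bits yields the physical base of the EBDA. A sketch with a
+ * hypothetical segment value:
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned int seg = 0x9fc0;
+
+	printf("EBDA at %#x\n", seg << 4);	/* 0x9fc00 */
+	return 0;
+}
+#endif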
35136 +
35137 +void __init setup_arch(char **cmdline_p)
35138 +{
35139 +       unsigned long kernel_end;
35140 +
35141 +#ifdef CONFIG_XEN
35142 +       /* Register a call for panic conditions. */
35143 +       notifier_chain_register(&panic_notifier_list, &xen_panic_block);
35144 +
35145 +       ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); 
35146 +       kernel_end = 0;         /* dummy */
35147 +       screen_info = SCREEN_INFO;
35148 +
35149 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
35150 +               /* This is drawn from a dump from vgacon:startup in
35151 +                * standard Linux. */
35152 +               screen_info.orig_video_mode = 3;
35153 +               screen_info.orig_video_isVGA = 1;
35154 +               screen_info.orig_video_lines = 25;
35155 +               screen_info.orig_video_cols = 80;
35156 +               screen_info.orig_video_ega_bx = 3;
35157 +               screen_info.orig_video_points = 16;
35158 +       } else
35159 +               screen_info.orig_video_isVGA = 0;
35160 +
35161 +       edid_info = EDID_INFO;
35162 +       saved_video_mode = SAVED_VIDEO_MODE;
35163 +       bootloader_type = LOADER_TYPE;
35164 +
35165 +#ifdef CONFIG_BLK_DEV_RAM
35166 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
35167 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
35168 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
35169 +
35171 +#endif
35172 +
35173 +       setup_xen_features();
35174 +
35175 +       if (xen_feature(XENFEAT_auto_translated_physmap) &&
35176 +           xen_start_info->shared_info < xen_start_info->nr_pages) {
35177 +               HYPERVISOR_shared_info =
35178 +                       (shared_info_t *)__va(xen_start_info->shared_info);
35179 +               memset(empty_zero_page, 0, sizeof(empty_zero_page));
35180 +       }
35181 +
35182 +       HYPERVISOR_vm_assist(VMASST_CMD_enable,
35183 +                            VMASST_TYPE_writable_pagetables);
35184 +
35185 +       ARCH_SETUP
35186 +#else
35187 +       ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
35188 +       screen_info = SCREEN_INFO;
35189 +       edid_info = EDID_INFO;
35190 +       saved_video_mode = SAVED_VIDEO_MODE;
35191 +       bootloader_type = LOADER_TYPE;
35192 +
35193 +#ifdef CONFIG_BLK_DEV_RAM
35194 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
35195 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
35196 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
35197 +#endif
35198 +       setup_memory_region();
35199 +       copy_edd();
35200 +#endif /* !CONFIG_XEN */
35201 +
35202 +       if (!MOUNT_ROOT_RDONLY)
35203 +               root_mountflags &= ~MS_RDONLY;
35204 +       init_mm.start_code = (unsigned long) &_text;
35205 +       init_mm.end_code = (unsigned long) &_etext;
35206 +       init_mm.end_data = (unsigned long) &_edata;
35207 +#ifdef CONFIG_XEN
35208 +       init_mm.brk = start_pfn << PAGE_SHIFT;
35209 +#else
35210 +       init_mm.brk = (unsigned long) &_end;    
35211 +
35212 +       code_resource.start = virt_to_phys(&_text);
35213 +       code_resource.end = virt_to_phys(&_etext)-1;
35214 +       data_resource.start = virt_to_phys(&_etext);
35215 +       data_resource.end = virt_to_phys(&_edata)-1;
35216 +#endif
35217 +
35218 +       parse_cmdline_early(cmdline_p);
35219 +
35220 +       early_identify_cpu(&boot_cpu_data);
35221 +
35222 +       /*
35223 +        * partially used pages are not usable - thus
35224 +        * we are rounding upwards:
35225 +        */
35226 +       end_pfn = e820_end_of_ram();
35227 +
35228 +       check_efer();
35229 +
35230 +       init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
35231 +
35232 +#ifdef CONFIG_ACPI_NUMA
35233 +       /*
35234 +        * Parse SRAT to discover nodes.
35235 +        */
35236 +       acpi_numa_init();
35237 +#endif
35238 +
35239 +#ifdef CONFIG_NUMA
35240 +       numa_initmem_init(start_pfn, end_pfn); 
35241 +#else
35242 +       contig_initmem_init(start_pfn, end_pfn);
35243 +#endif
35244 +
35245 +#ifndef CONFIG_XEN
35246 +       /* Reserve direct mapping */
35247 +       reserve_bootmem_generic(table_start << PAGE_SHIFT, 
35248 +                               (table_end - table_start) << PAGE_SHIFT);
35249 +
35250 +       /* reserve kernel */
35251 +       kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
35252 +       reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
35253 +
35254 +       /*
35255 +        * reserve physical page 0 - it's a special BIOS page on many boxes,
35256 +        * enabling clean reboots, SMP operation, laptop functions.
35257 +        */
35258 +       reserve_bootmem_generic(0, PAGE_SIZE);
35259 +
35260 +       /* reserve ebda region */
35261 +       reserve_ebda_region();
35262 +#endif
35263 +
35264 +#ifdef CONFIG_SMP
35265 +       /*
35266 +        * But first pinch a few for the stack/trampoline stuff
35267 +        * FIXME: Don't need the extra page at 4K, but need to fix
35268 +        * trampoline before removing it. (see the GDT stuff)
35269 +        */
35270 +       reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
35271 +
35272 +       /* Reserve SMP trampoline */
35273 +       reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
35274 +#endif
35275 +
35276 +#ifdef CONFIG_ACPI_SLEEP
35277 +       /*
35278 +        * Reserve low memory region for sleep support.
35279 +        */
35280 +       acpi_reserve_bootmem();
35281 +#endif
35282 +#ifdef CONFIG_XEN
35283 +#ifdef CONFIG_BLK_DEV_INITRD
35284 +       if (xen_start_info->mod_start) {
35285 +               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
35286 +                       /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
35287 +                       initrd_start = INITRD_START + PAGE_OFFSET;
35288 +                       initrd_end = initrd_start+INITRD_SIZE;
35289 +                       initrd_below_start_ok = 1;
35290 +               } else {
35291 +                       printk(KERN_ERR "initrd extends beyond end of memory "
35292 +                               "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
35293 +                               (unsigned long)(INITRD_START + INITRD_SIZE),
35294 +                               (unsigned long)(end_pfn << PAGE_SHIFT));
35295 +                       initrd_start = 0;
35296 +               }
35297 +       }
35298 +#endif
35299 +#else  /* CONFIG_XEN */
35300 +#ifdef CONFIG_BLK_DEV_INITRD
35301 +       if (LOADER_TYPE && INITRD_START) {
35302 +               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
35303 +                       reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
35304 +                       initrd_start =
35305 +                               INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
35306 +                       initrd_end = initrd_start+INITRD_SIZE;
35307 +               }
35308 +               else {
35309 +                       printk(KERN_ERR "initrd extends beyond end of memory "
35310 +                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
35311 +                           (unsigned long)(INITRD_START + INITRD_SIZE),
35312 +                           (unsigned long)(end_pfn << PAGE_SHIFT));
35313 +                       initrd_start = 0;
35314 +               }
35315 +       }
35316 +#endif
35317 +#endif /* !CONFIG_XEN */
35318 +#ifdef CONFIG_KEXEC
35319 +       if (crashk_res.start != crashk_res.end) {
35320 +               reserve_bootmem(crashk_res.start,
35321 +                       crashk_res.end - crashk_res.start + 1);
35322 +       }
35323 +#endif
35324 +
35325 +       paging_init();
35326 +#ifdef CONFIG_X86_LOCAL_APIC
35327 +       /*
35328 +        * Find and reserve possible boot-time SMP configuration:
35329 +        */
35330 +       find_smp_config();
35331 +#endif
35332 +#ifdef CONFIG_XEN
35333 +       {
35334 +               int i, j, k, fpp;
35335 +               unsigned long va;
35336 +
35337 +               /* 'Initial mapping' of initrd must be destroyed. */
35338 +               for (va = xen_start_info->mod_start;
35339 +                    va < (xen_start_info->mod_start+xen_start_info->mod_len);
35340 +                    va += PAGE_SIZE) {
35341 +                       HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
35342 +               }
35343 +
35344 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
35345 +                       /* Make sure we have a large enough P->M table. */
35346 +                       phys_to_machine_mapping = alloc_bootmem(
35347 +                               end_pfn * sizeof(unsigned long));
35348 +                       memset(phys_to_machine_mapping, ~0,
35349 +                              end_pfn * sizeof(unsigned long));
35350 +                       memcpy(phys_to_machine_mapping,
35351 +                              (unsigned long *)xen_start_info->mfn_list,
35352 +                              xen_start_info->nr_pages * sizeof(unsigned long));
35353 +                       free_bootmem(
35354 +                               __pa(xen_start_info->mfn_list),
35355 +                               PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
35356 +                                               sizeof(unsigned long))));
35357 +
35358 +                       /* Destroy the 'initial mapping' of the old p2m table. */
35359 +                       for (va = xen_start_info->mfn_list;
35360 +                            va < (xen_start_info->mfn_list +
35361 +                                  (xen_start_info->nr_pages*sizeof(unsigned long)));
35362 +                            va += PAGE_SIZE) {
35363 +                               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
35364 +                       }
35365 +
35366 +                       /*
35367 +                        * Initialise the list of frames that specify the
35368 +                        * frames making up the p2m table.  Used by
35369 +                        * save/restore.
35370 +                        */
35371 +                       pfn_to_mfn_frame_list_list = alloc_bootmem(PAGE_SIZE);
35372 +                       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
35373 +                               virt_to_mfn(pfn_to_mfn_frame_list_list);
35374 +
35375 +                       fpp = PAGE_SIZE/sizeof(unsigned long);
35376 +                       for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
35377 +                               if ((j % fpp) == 0) {
35378 +                                       k++;
35379 +                                       BUG_ON(k>=fpp);
35380 +                                       pfn_to_mfn_frame_list[k] =
35381 +                                               alloc_bootmem(PAGE_SIZE);
35382 +                                       pfn_to_mfn_frame_list_list[k] =
35383 +                                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
35384 +                                       j=0;
35385 +                               }
35386 +                               pfn_to_mfn_frame_list[k][j] =
35387 +                                       virt_to_mfn(&phys_to_machine_mapping[i]);
35388 +                       }
35389 +                       HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
35390 +               }
35391 +
35392 +       }
35393 +
35394 +       if (!(xen_start_info->flags & SIF_INITDOMAIN))
35395 +       {
35396 +               acpi_disabled = 1;
35397 +#ifdef  CONFIG_ACPI
35398 +               acpi_ht = 0;
35399 +#endif
35400 +       }
35401 +#endif
35402 +
35403 +#ifndef CONFIG_XEN
35404 +       check_ioapic();
35405 +#endif
35406 +
35407 +       zap_low_mappings(0);
35408 +
35409 +#ifdef CONFIG_ACPI
35410 +       /*
35411 +        * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
35412 +        * Call this early for SRAT node setup.
35413 +        */
35414 +       acpi_boot_table_init();
35415 +
35416 +       /*
35417 +        * Read APIC and some other early information from ACPI tables.
35418 +        */
35419 +       acpi_boot_init();
35420 +#endif
35421 +
35422 +       init_cpu_to_node();
35423 +
35424 +#ifdef CONFIG_X86_LOCAL_APIC
35425 +       /*
35426 +        * get boot-time SMP configuration:
35427 +        */
35428 +       if (smp_found_config)
35429 +               get_smp_config();
35430 +#ifndef CONFIG_XEN
35431 +       init_apic_mappings();
35432 +#endif
35433 +#endif
35434 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
35435 +       prefill_possible_map();
35436 +#endif
35437 +
35438 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
35439 +       /*
35440 +        * Request address space for all standard RAM and ROM resources
35441 +        * and also for regions reported as reserved by the e820.
35442 +        */
35443 +       probe_roms();
35444 +       e820_reserve_resources(); 
35445 +#endif
35446 +
35447 +       request_resource(&iomem_resource, &video_ram_resource);
35448 +
35449 +       {
35450 +       unsigned i;
35451 +       /* request I/O space for devices used on all i[345]86 PCs */
35452 +       for (i = 0; i < STANDARD_IO_RESOURCES; i++)
35453 +               request_resource(&ioport_resource, &standard_io_resources[i]);
35454 +       }
35455 +
35456 +       e820_setup_gap();
35457 +
35458 +#ifdef CONFIG_GART_IOMMU
35459 +       iommu_hole_init();
35460 +#endif
35461 +
35462 +#ifdef CONFIG_XEN
35463 +       {
35464 +               physdev_op_t op;
35465 +
35466 +               op.cmd             = PHYSDEVOP_SET_IOPL;
35467 +               op.u.set_iopl.iopl = 1;
35468 +               HYPERVISOR_physdev_op(&op);
35469 +
35470 +               if (xen_start_info->flags & SIF_INITDOMAIN) {
35471 +                       if (!(xen_start_info->flags & SIF_PRIVILEGED))
35472 +                               panic("Xen granted us console access "
35473 +                                     "but not privileged status");
35474 +                      
35475 +#ifdef CONFIG_VT
35476 +#if defined(CONFIG_VGA_CONSOLE)
35477 +                       conswitchp = &vga_con;
35478 +#elif defined(CONFIG_DUMMY_CONSOLE)
35479 +                       conswitchp = &dummy_con;
35480 +#endif
35481 +#endif
35482 +               } else {
35483 +                       extern int console_use_vt;
35484 +                       console_use_vt = 0;
35485 +               }
35486 +       }
35487 +#else  /* CONFIG_XEN */
35488 +
35489 +#ifdef CONFIG_VT
35490 +#if defined(CONFIG_VGA_CONSOLE)
35491 +       conswitchp = &vga_con;
35492 +#elif defined(CONFIG_DUMMY_CONSOLE)
35493 +       conswitchp = &dummy_con;
35494 +#endif
35495 +#endif
35496 +
35497 +#endif /* !CONFIG_XEN */
35498 +}
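+
+/*
+ * In the p2m bookkeeping above, fpp = PAGE_SIZE/sizeof(long) (512 on
+ * x86_64), so the p2m page holding pfn i is page i/fpp of the table,
+ * recorded as entry (i/fpp) % fpp of frame list (i/fpp) / fpp. A
+ * sketch of that two-level indexing:
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned long fpp = 512, pfn = 1000000;
+	unsigned long page = pfn / fpp;		/* p2m page holding pfn */
+
+	printf("list %lu, entry %lu\n", page / fpp, page % fpp);	/* 3, 417 */
+	return 0;
+}
+#endif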
35499 +
35500 +#ifdef CONFIG_XEN
35501 +static int
35502 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
35503 +{
35504 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
35505 +       /* we're never actually going to get here... */
35506 +       return NOTIFY_DONE;
35507 +}
35508 +#endif /* CONFIG_XEN */
35509 +
35510 +
35511 +static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
35512 +{
35513 +       unsigned int *v;
35514 +
35515 +       if (c->extended_cpuid_level < 0x80000004)
35516 +               return 0;
35517 +
35518 +       v = (unsigned int *) c->x86_model_id;
35519 +       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
35520 +       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
35521 +       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
35522 +       c->x86_model_id[48] = 0;
35523 +       return 1;
35524 +}
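+
+/*
+ * The model string is simply the twelve registers returned by CPUID
+ * leaves 0x80000002..0x80000004 laid out in memory. A userspace
+ * sketch, assuming GCC's <cpuid.h> helper:
+ */
+#if 0
+#include <stdio.h>
+#include <cpuid.h>
+
+int main(void)
+{
+	unsigned int v[12];
+	int i;
+
+	for (i = 0; i < 3; i++)
+		__get_cpuid(0x80000002 + i, &v[4*i], &v[4*i+1],
+			    &v[4*i+2], &v[4*i+3]);
+	printf("%.48s\n", (char *)v);
+	return 0;
+}
+#endif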
35525 +
35526 +
35527 +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
35528 +{
35529 +       unsigned int n, dummy, eax, ebx, ecx, edx;
35530 +
35531 +       n = c->extended_cpuid_level;
35532 +
35533 +       if (n >= 0x80000005) {
35534 +               cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
35535 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
35536 +                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
35537 +               c->x86_cache_size=(ecx>>24)+(edx>>24);
35538 +               /* On K8 L1 TLB is inclusive, so don't count it */
35539 +               c->x86_tlbsize = 0;
35540 +       }
35541 +
35542 +       if (n >= 0x80000006) {
35543 +               cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
35544 +               ecx = cpuid_ecx(0x80000006);
35545 +               c->x86_cache_size = ecx >> 16;
35546 +               c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
35547 +
35548 +               printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
35549 +               c->x86_cache_size, ecx & 0xFF);
35550 +       }
35551 +
35552 +       if (n >= 0x80000007)
35553 +               cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); 
35554 +       if (n >= 0x80000008) {
35555 +               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 
35556 +               c->x86_virt_bits = (eax >> 8) & 0xff;
35557 +               c->x86_phys_bits = eax & 0xff;
35558 +       }
35559 +}
35560 +
35561 +#ifdef CONFIG_NUMA
35562 +static int nearby_node(int apicid)
35563 +{
35564 +       int i;
35565 +       for (i = apicid - 1; i >= 0; i--) {
35566 +               int node = apicid_to_node[i];
35567 +               if (node != NUMA_NO_NODE && node_online(node))
35568 +                       return node;
35569 +       }
35570 +       for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
35571 +               int node = apicid_to_node[i];
35572 +               if (node != NUMA_NO_NODE && node_online(node))
35573 +                       return node;
35574 +       }
35575 +       return first_node(node_online_map); /* Shouldn't happen */
35576 +}
35577 +#endif
35578 +
35579 +/*
35580 + * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores.
35581 + * Assumes number of cores is a power of two.
35582 + */
35583 +static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
35584 +{
35585 +#ifdef CONFIG_SMP
35586 +       int cpu = smp_processor_id();
35587 +       unsigned bits;
35588 +#ifdef CONFIG_NUMA
35589 +       int node = 0;
35590 +       unsigned apicid = phys_proc_id[cpu];
35591 +#endif
35592 +
35593 +       bits = 0;
35594 +       while ((1 << bits) < c->x86_max_cores)
35595 +               bits++;
35596 +
35597 +       /* Low order bits define the core id (index of core in socket) */
35598 +       cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1);
35599 +       /* Convert the APIC ID into the socket ID */
35600 +       phys_proc_id[cpu] >>= bits;
35601 +
35602 +#ifdef CONFIG_NUMA
35603 +       node = phys_proc_id[cpu];
35604 +       if (apicid_to_node[apicid] != NUMA_NO_NODE)
35605 +               node = apicid_to_node[apicid];
35606 +       if (!node_online(node)) {
35607 +               /* Two possibilities here:
35608 +                  - The CPU is missing memory and no node was created.
35609 +                  In that case try picking one from a nearby CPU
35610 +                  - The APIC IDs differ from the HyperTransport node IDs
35611 +                  which the K8 northbridge parsing fills in.
35612 +                  Assume they are all increased by a constant offset,
35613 +                  but in the same order as the HT nodeids.
35614 +                  If that doesn't result in a usable node fall back to the
35615 +                  path for the previous case.  */
35616 +               int ht_nodeid = apicid - (phys_proc_id[0] << bits);
35617 +               if (ht_nodeid >= 0 &&
35618 +                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
35619 +                       node = apicid_to_node[ht_nodeid];
35620 +               /* Pick a nearby node */
35621 +               if (!node_online(node))
35622 +                       node = nearby_node(apicid);
35623 +       }
35624 +       numa_set_node(cpu, node);
35625 +
35626 +       printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
35627 +                       cpu, c->x86_max_cores, node, cpu_core_id[cpu]);
35628 +#endif
35629 +#endif
35630 +}
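+
+/*
+ * With 2^bits cores per socket, the low bits of the initial APIC id
+ * select the core and the remaining bits the socket. A sketch for a
+ * hypothetical dual-core part (bits == 1), APIC id 5:
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned int apicid = 5, bits = 1;
+
+	printf("core %u, socket %u\n",
+	       apicid & ((1u << bits) - 1), apicid >> bits);	/* 1, 2 */
+	return 0;
+}
+#endif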
35631 +
35632 +static int __init init_amd(struct cpuinfo_x86 *c)
35633 +{
35634 +       int r;
35635 +       unsigned level;
35636 +
35637 +#ifdef CONFIG_SMP
35638 +       unsigned long value;
35639 +
35640 +       /*
35641 +        * Disable TLB flush filter by setting HWCR.FFDIS on K8
35642 +        * bit 6 of msr C001_0015
35643 +        *
35644 +        * Errata 63 for SH-B3 steppings
35645 +        * Errata 122 for all steppings (F+ have it disabled by default)
35646 +        */
35647 +       if (c->x86 == 15) {
35648 +               rdmsrl(MSR_K8_HWCR, value);
35649 +               value |= 1 << 6;
35650 +               wrmsrl(MSR_K8_HWCR, value);
35651 +       }
35652 +#endif
35653 +
35654 +       /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
35655 +          3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway */
35656 +       clear_bit(0*32+31, &c->x86_capability);
35657 +       
35658 +       /* On C+ stepping K8 rep microcode works well for copy/memset */
35659 +       level = cpuid_eax(1);
35660 +       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
35661 +               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
35662 +
35663 +       r = get_model_name(c);
35664 +       if (!r) { 
35665 +               switch (c->x86) { 
35666 +               case 15:
35667 +                       /* Should distinguish Models here, but this is only
35668 +                          a fallback anyway. */
35669 +                       strcpy(c->x86_model_id, "Hammer");
35670 +                       break; 
35671 +               } 
35672 +       } 
35673 +       display_cacheinfo(c);
35674 +
35675 +       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
35676 +       if (c->x86_power & (1<<8))
35677 +               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
35678 +
35679 +       if (c->extended_cpuid_level >= 0x80000008) {
35680 +               c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
35681 +               if (c->x86_max_cores & (c->x86_max_cores - 1))
35682 +                       c->x86_max_cores = 1;
35683 +
35684 +               amd_detect_cmp(c);
35685 +       }
35686 +
35687 +       return r;
35688 +}
35689 +
35690 +static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
35691 +{
35692 +#ifdef CONFIG_SMP
35693 +       u32     eax, ebx, ecx, edx;
35694 +       int     index_msb, core_bits;
35695 +       int     cpu = smp_processor_id();
35696 +
35697 +       cpuid(1, &eax, &ebx, &ecx, &edx);
35698 +
35699 +       c->apicid = phys_pkg_id(0);
35700 +
35701 +       if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
35702 +               return;
35703 +
35704 +       smp_num_siblings = (ebx & 0xff0000) >> 16;
35705 +
35706 +       if (smp_num_siblings == 1) {
35707 +               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
35708 +       } else if (smp_num_siblings > 1 ) {
35709 +
35710 +               if (smp_num_siblings > NR_CPUS) {
35711 +                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d\n", smp_num_siblings);
35712 +                       smp_num_siblings = 1;
35713 +                       return;
35714 +               }
35715 +
35716 +               index_msb = get_count_order(smp_num_siblings);
35717 +               phys_proc_id[cpu] = phys_pkg_id(index_msb);
35718 +
35719 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
35720 +                      phys_proc_id[cpu]);
35721 +
35722 +               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
35723 +
35724 +               index_msb = get_count_order(smp_num_siblings);
35725 +
35726 +               core_bits = get_count_order(c->x86_max_cores);
35727 +
35728 +               cpu_core_id[cpu] = phys_pkg_id(index_msb) &
35729 +                                              ((1 << core_bits) - 1);
35730 +
35731 +               if (c->x86_max_cores > 1)
35732 +                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
35733 +                              cpu_core_id[cpu]);
35734 +       }
35735 +#endif
35736 +}
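+
+/*
+ * detect_ht() splits the APIC id using get_count_order(), i.e.
+ * ceil(log2(n)). A sketch of the field widths for 4 siblings
+ * (2 HT threads x 2 cores):
+ */
+#if 0
+#include <stdio.h>
+
+static int count_order(int n)		/* ceil(log2(n)) */
+{
+	int o = 0;
+
+	while ((1 << o) < n)
+		o++;
+	return o;
+}
+
+int main(void)
+{
+	printf("package shift %d, core bits %d\n",
+	       count_order(4), count_order(2));	/* 2, 1 */
+	return 0;
+}
+#endif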
35737 +
35738 +/*
35739 + * find out the number of processor cores on the die
35740 + */
35741 +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
35742 +{
35743 +       unsigned int eax;
35744 +
35745 +       if (c->cpuid_level < 4)
35746 +               return 1;
35747 +
35748 +       __asm__("cpuid"
35749 +               : "=a" (eax)
35750 +               : "0" (4), "c" (0)
35751 +               : "bx", "dx");
35752 +
35753 +       if (eax & 0x1f)
35754 +               return ((eax >> 26) + 1);
35755 +       else
35756 +               return 1;
35757 +}
35758 +
35759 +static void srat_detect_node(void)
35760 +{
35761 +#ifdef CONFIG_NUMA
35762 +       unsigned node;
35763 +       int cpu = smp_processor_id();
35764 +
35765 +       /* Don't do the funky fallback heuristics the AMD version employs
35766 +          for now. */
35767 +       node = apicid_to_node[hard_smp_processor_id()];
35768 +       if (node == NUMA_NO_NODE)
35769 +               node = 0;
35770 +       numa_set_node(cpu, node);
35771 +
35772 +       if (acpi_numa > 0)
35773 +               printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
35774 +#endif
35775 +}
35776 +
35777 +static void __cpuinit init_intel(struct cpuinfo_x86 *c)
35778 +{
35779 +       /* Cache sizes */
35780 +       unsigned n;
35781 +
35782 +       init_intel_cacheinfo(c);
35783 +       n = c->extended_cpuid_level;
35784 +       if (n >= 0x80000008) {
35785 +               unsigned eax = cpuid_eax(0x80000008);
35786 +               c->x86_virt_bits = (eax >> 8) & 0xff;
35787 +               c->x86_phys_bits = eax & 0xff;
35788 +               /* CPUID workaround for Intel 0F34 CPU */
35789 +               if (c->x86_vendor == X86_VENDOR_INTEL &&
35790 +                   c->x86 == 0xF && c->x86_model == 0x3 &&
35791 +                   c->x86_mask == 0x4)
35792 +                       c->x86_phys_bits = 36;
35793 +       }
35794 +
35795 +       if (c->x86 == 15)
35796 +               c->x86_cache_alignment = c->x86_clflush_size * 2;
35797 +       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
35798 +           (c->x86 == 0x6 && c->x86_model >= 0x0e))
35799 +               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
35800 +       set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
35801 +       c->x86_max_cores = intel_num_cpu_cores(c);
35802 +
35803 +       srat_detect_node();
35804 +}
35805 +
35806 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
35807 +{
35808 +       char *v = c->x86_vendor_id;
35809 +
35810 +       if (!strcmp(v, "AuthenticAMD"))
35811 +               c->x86_vendor = X86_VENDOR_AMD;
35812 +       else if (!strcmp(v, "GenuineIntel"))
35813 +               c->x86_vendor = X86_VENDOR_INTEL;
35814 +       else
35815 +               c->x86_vendor = X86_VENDOR_UNKNOWN;
35816 +}
35817 +
35818 +struct cpu_model_info {
35819 +       int vendor;
35820 +       int family;
35821 +       char *model_names[16];
35822 +};
35823 +
35824 +/* Do some early cpuid on the boot CPU to get some parameters that are
35825 +   needed before check_bugs. Everything advanced is in identify_cpu
35826 +   below. */
35827 +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
35828 +{
35829 +       u32 tfms;
35830 +
35831 +       c->loops_per_jiffy = loops_per_jiffy;
35832 +       c->x86_cache_size = -1;
35833 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
35834 +       c->x86_model = c->x86_mask = 0; /* So far unknown... */
35835 +       c->x86_vendor_id[0] = '\0'; /* Unset */
35836 +       c->x86_model_id[0] = '\0';  /* Unset */
35837 +       c->x86_clflush_size = 64;
35838 +       c->x86_cache_alignment = c->x86_clflush_size;
35839 +       c->x86_max_cores = 1;
35840 +       c->extended_cpuid_level = 0;
35841 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
35842 +
35843 +       /* Get vendor name */
35844 +       cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
35845 +             (unsigned int *)&c->x86_vendor_id[0],
35846 +             (unsigned int *)&c->x86_vendor_id[8],
35847 +             (unsigned int *)&c->x86_vendor_id[4]);
35848 +               
35849 +       get_cpu_vendor(c);
35850 +
35851 +       /* Initialize the standard set of capabilities */
35852 +       /* Note that the vendor-specific code below might override */
35853 +
35854 +       /* Intel-defined flags: level 0x00000001 */
35855 +       if (c->cpuid_level >= 0x00000001) {
35856 +               __u32 misc;
35857 +               cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
35858 +                     &c->x86_capability[0]);
35859 +               c->x86 = (tfms >> 8) & 0xf;
35860 +               c->x86_model = (tfms >> 4) & 0xf;
35861 +               c->x86_mask = tfms & 0xf;
35862 +               if (c->x86 == 0xf)
35863 +                       c->x86 += (tfms >> 20) & 0xff;
35864 +               if (c->x86 >= 0x6)
35865 +                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
35866 +               if (c->x86_capability[0] & (1<<19)) 
35867 +                       c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
35868 +       } else {
35869 +               /* Have CPUID level 0 only - unheard of */
35870 +               c->x86 = 4;
35871 +       }
35872 +
35873 +#ifdef CONFIG_SMP
35874 +       phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
35875 +#endif
35876 +}
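+
+/*
+ * CPUID leaf 1 packs stepping, model and family into tfms, with
+ * extended fields that only contribute for family 0xf (and, for the
+ * model, family >= 6). A sketch decoding a hypothetical tfms value:
+ */
+#if 0
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned int tfms = 0x00020f48;	/* ext-model 2, family f, model 4 */
+	unsigned int family = (tfms >> 8) & 0xf;
+	unsigned int model = (tfms >> 4) & 0xf;
+
+	if (family == 0xf)
+		family += (tfms >> 20) & 0xff;
+	if (family >= 0x6)
+		model += ((tfms >> 16) & 0xf) << 4;
+	printf("family %#x, model %#x, stepping %u\n",
+	       family, model, tfms & 0xf);	/* 0xf, 0x24, 8 */
+	return 0;
+}
+#endif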
35877 +
35878 +/*
35879 + * This does the hard work of actually picking apart the CPU stuff...
35880 + */
35881 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
35882 +{
35883 +       int i;
35884 +       u32 xlvl;
35885 +
35886 +       early_identify_cpu(c);
35887 +
35888 +       /* AMD-defined flags: level 0x80000001 */
35889 +       xlvl = cpuid_eax(0x80000000);
35890 +       c->extended_cpuid_level = xlvl;
35891 +       if ((xlvl & 0xffff0000) == 0x80000000) {
35892 +               if (xlvl >= 0x80000001) {
35893 +                       c->x86_capability[1] = cpuid_edx(0x80000001);
35894 +                       c->x86_capability[6] = cpuid_ecx(0x80000001);
35895 +               }
35896 +               if (xlvl >= 0x80000004)
35897 +                       get_model_name(c); /* Default name */
35898 +       }
35899 +
35900 +       /* Transmeta-defined flags: level 0x80860001 */
35901 +       xlvl = cpuid_eax(0x80860000);
35902 +       if ((xlvl & 0xffff0000) == 0x80860000) {
35903 +               /* Don't set extended_cpuid_level here for now, to avoid confusion. */
35904 +               if (xlvl >= 0x80860001)
35905 +                       c->x86_capability[2] = cpuid_edx(0x80860001);
35906 +       }
35907 +
35908 +       /*
35909 +        * Vendor-specific initialization.  In this section we
35910 +        * canonicalize the feature flags, meaning if there are
35911 +        * features a certain CPU supports which CPUID doesn't
35912 +        * tell us, CPUID claiming incorrect flags, or other bugs,
35913 +        * we handle them here.
35914 +        *
35915 +        * At the end of this section, c->x86_capability better
35916 +        * indicate the features this CPU genuinely supports!
35917 +        */
35918 +       switch (c->x86_vendor) {
35919 +       case X86_VENDOR_AMD:
35920 +               init_amd(c);
35921 +               break;
35922 +
35923 +       case X86_VENDOR_INTEL:
35924 +               init_intel(c);
35925 +               break;
35926 +
35927 +       case X86_VENDOR_UNKNOWN:
35928 +       default:
35929 +               display_cacheinfo(c);
35930 +               break;
35931 +       }
35932 +
35933 +       select_idle_routine(c);
35934 +       detect_ht(c); 
35935 +
35936 +       /*
35937 +        * On SMP, boot_cpu_data holds the common feature set between
35938 +        * all CPUs; so make sure that we indicate which features are
35939 +        * common between the CPUs.  The first time this routine gets
35940 +        * executed, c == &boot_cpu_data.
35941 +        */
35942 +       if (c != &boot_cpu_data) {
35943 +               /* AND the already accumulated flags with these */
35944 +               for (i = 0 ; i < NCAPINTS ; i++)
35945 +                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
35946 +       }
35947 +
35948 +#ifdef CONFIG_X86_MCE
35949 +       mcheck_init(c);
35950 +#endif
35951 +       if (c == &boot_cpu_data)
35952 +               mtrr_bp_init();
35953 +       else
35954 +               mtrr_ap_init();
35955 +#ifdef CONFIG_NUMA
35956 +       numa_add_cpu(smp_processor_id());
35957 +#endif
35958 +}
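The `boot_cpu_data.x86_capability[i] &= c->x86_capability[i]` step above is what guarantees that the boot CPU's record ends up holding only features common to every CPU. A toy illustration of that folding, with hypothetical capability words:

    #include <stdio.h>

    int main(void)
    {
            /* hypothetical capability word 0 for the boot CPU and one AP */
            unsigned int boot_cap0 = 0xbfebfbff;
            unsigned int ap_cap0   = 0xbfe9fbff;    /* AP lacks one feature bit */

            boot_cap0 &= ap_cap0;   /* only bits supported by every CPU survive */
            printf("common word 0: %08x\n", boot_cap0);
            return 0;
    }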
35959
35960 +
35961 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
35962 +{
35963 +       if (c->x86_model_id[0])
35964 +               printk("%s", c->x86_model_id);
35965 +
35966 +       if (c->x86_mask || c->cpuid_level >= 0) 
35967 +               printk(" stepping %02x\n", c->x86_mask);
35968 +       else
35969 +               printk("\n");
35970 +}
35971 +
35972 +/*
35973 + *     Get CPU information for use by the procfs.
35974 + */
35975 +
35976 +static int show_cpuinfo(struct seq_file *m, void *v)
35977 +{
35978 +       struct cpuinfo_x86 *c = v;
35979 +
35980 +       /* 
35981 +        * These flag bits must match the definitions in <asm/cpufeature.h>.
35982 +        * NULL means this bit is undefined or reserved; either way it doesn't
35983 +        * have meaning as far as Linux is concerned.  Note that it's important
35984 +        * to realize there is a difference between this table and CPUID -- if
35985 +        * applications want to get the raw CPUID data, they should access
35986 +        * /dev/cpu/<cpu_nr>/cpuid instead.
35987 +        */
35988 +       static char *x86_cap_flags[] = {
35989 +               /* Intel-defined */
35990 +               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
35991 +               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
35992 +               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
35993 +               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
35994 +
35995 +               /* AMD-defined */
35996 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35997 +               NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
35998 +               NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
35999 +               NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
36000 +
36001 +               /* Transmeta-defined */
36002 +               "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
36003 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36004 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36005 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36006 +
36007 +               /* Other (Linux-defined) */
36008 +               "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
36009 +               "constant_tsc", NULL, NULL,
36010 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36011 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36012 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36013 +
36014 +               /* Intel-defined (#2) */
36015 +               "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est",
36016 +               "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
36017 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36018 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36019 +
36020 +               /* VIA/Cyrix/Centaur-defined */
36021 +               NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
36022 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36023 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36024 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36025 +
36026 +               /* AMD-defined (#2) */
36027 +               "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
36028 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36029 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36030 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36031 +       };
36032 +       static char *x86_power_flags[] = { 
36033 +               "ts",   /* temperature sensor */
36034 +               "fid",  /* frequency id control */
36035 +               "vid",  /* voltage id control */
36036 +               "ttp",  /* thermal trip */
36037 +               "tm",
36038 +               "stc",
36039 +               NULL,
36040 +               /* nothing */   /* constant_tsc - moved to flags */
36041 +       };
36042 +
36043 +
36044 +#ifdef CONFIG_SMP
36045 +       if (!cpu_online(c-cpu_data))
36046 +               return 0;
36047 +#endif
36048 +
36049 +       seq_printf(m,"processor\t: %u\n"
36050 +                    "vendor_id\t: %s\n"
36051 +                    "cpu family\t: %d\n"
36052 +                    "model\t\t: %d\n"
36053 +                    "model name\t: %s\n",
36054 +                    (unsigned)(c-cpu_data),
36055 +                    c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
36056 +                    c->x86,
36057 +                    (int)c->x86_model,
36058 +                    c->x86_model_id[0] ? c->x86_model_id : "unknown");
36059 +       
36060 +       if (c->x86_mask || c->cpuid_level >= 0)
36061 +               seq_printf(m, "stepping\t: %d\n", c->x86_mask);
36062 +       else
36063 +               seq_printf(m, "stepping\t: unknown\n");
36064 +       
36065 +       if (cpu_has(c,X86_FEATURE_TSC)) {
36066 +               unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
36067 +               if (!freq)
36068 +                       freq = cpu_khz;
36069 +               seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
36070 +                            freq / 1000, (freq % 1000));
36071 +       }
36072 +
36073 +       /* Cache size */
36074 +       if (c->x86_cache_size >= 0) 
36075 +               seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
36076 +       
36077 +#ifdef CONFIG_SMP
36078 +       if (smp_num_siblings * c->x86_max_cores > 1) {
36079 +               int cpu = c - cpu_data;
36080 +               seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
36081 +               seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
36082 +               seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
36083 +               seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
36084 +       }
36085 +#endif 
36086 +
36087 +       seq_printf(m,
36088 +               "fpu\t\t: yes\n"
36089 +               "fpu_exception\t: yes\n"
36090 +               "cpuid level\t: %d\n"
36091 +               "wp\t\t: yes\n"
36092 +               "flags\t\t:",
36093 +                  c->cpuid_level);
36094 +
36095 +       { 
36096 +               int i; 
36097 +               for ( i = 0 ; i < 32*NCAPINTS ; i++ )
36098 +                       if ( test_bit(i, &c->x86_capability) &&
36099 +                            x86_cap_flags[i] != NULL )
36100 +                               seq_printf(m, " %s", x86_cap_flags[i]);
36101 +       }
36102 +               
36103 +       seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
36104 +                  c->loops_per_jiffy/(500000/HZ),
36105 +                  (c->loops_per_jiffy/(5000/HZ)) % 100);
36106 +
36107 +       if (c->x86_tlbsize > 0) 
36108 +               seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
36109 +       seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
36110 +       seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
36111 +
36112 +       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 
36113 +                  c->x86_phys_bits, c->x86_virt_bits);
36114 +
36115 +       seq_printf(m, "power management:");
36116 +       {
36117 +               unsigned i;
36118 +               for (i = 0; i < 32; i++) 
36119 +                       if (c->x86_power & (1 << i)) {
36120 +                               if (i < ARRAY_SIZE(x86_power_flags) &&
36121 +                                       x86_power_flags[i])
36122 +                                       seq_printf(m, "%s%s",
36123 +                                               x86_power_flags[i][0]?" ":"",
36124 +                                               x86_power_flags[i]);
36125 +                               else
36126 +                                       seq_printf(m, " [%d]", i);
36127 +                       }
36128 +       }
36129 +
36130 +       seq_printf(m, "\n\n");
36131 +
36132 +       return 0;
36133 +}
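The bogomips line above encodes BogoMIPS = loops_per_jiffy * HZ / 500000, split into an integer part and two decimal digits. A standalone sketch of the same arithmetic, with a hypothetical loops_per_jiffy and an assumed tick rate:

    #include <stdio.h>

    int main(void)
    {
            const unsigned long hz = 250;                   /* assumed tick rate */
            unsigned long loops_per_jiffy = 1994752;        /* hypothetical */

            printf("bogomips\t: %lu.%02lu\n",
                   loops_per_jiffy / (500000 / hz),         /* integer part */
                   (loops_per_jiffy / (5000 / hz)) % 100);  /* two decimals */
            return 0;
    }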
36134 +
36135 +static void *c_start(struct seq_file *m, loff_t *pos)
36136 +{
36137 +       return *pos < NR_CPUS ? cpu_data + *pos : NULL;
36138 +}
36139 +
36140 +static void *c_next(struct seq_file *m, void *v, loff_t *pos)
36141 +{
36142 +       ++*pos;
36143 +       return c_start(m, pos);
36144 +}
36145 +
36146 +static void c_stop(struct seq_file *m, void *v)
36147 +{
36148 +}
36149 +
36150 +struct seq_operations cpuinfo_op = {
36151 +       .start = c_start,
36152 +       .next = c_next,
36153 +       .stop = c_stop,
36154 +       .show = show_cpuinfo,
36155 +};
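A note on the traversal: the seq_file core drives the four callbacks above as start → show → next → ... → stop, one cpu_data slot per iteration. A minimal userspace mock of that walk (illustrative only; the real iteration lives in fs/seq_file.c):

    #include <stdio.h>

    #define NR_CPUS 4
    static int cpu_data[NR_CPUS];           /* stand-in per-CPU records */

    static void *c_start(long *pos) { return *pos < NR_CPUS ? &cpu_data[*pos] : NULL; }
    static void *c_next(long *pos)  { ++*pos; return c_start(pos); }

    int main(void)
    {
            long pos = 0;
            for (void *v = c_start(&pos); v; v = c_next(&pos))
                    printf("processor\t: %ld\n", pos);      /* the show() step */
            return 0;
    }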
36156 +
36157 +static int __init run_dmi_scan(void)
36158 +{
36159 +       dmi_scan_machine();
36160 +       return 0;
36161 +}
36162 +core_initcall(run_dmi_scan);
36163 +
36164 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/smp-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/smp-xen.c
36165 --- ref-linux-2.6.16.9/arch/x86_64/kernel/smp-xen.c     1970-01-01 01:00:00.000000000 +0100
36166 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/smp-xen.c        2006-04-10 00:05:52.000000000 +0200
36167 @@ -0,0 +1,596 @@
36168 +/*
36169 + *     Intel SMP support routines.
36170 + *
36171 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
36172 + *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
36173 + *      (c) 2002,2003 Andi Kleen, SuSE Labs.
36174 + *
36175 + *     This code is released under the GNU General Public License version 2 or
36176 + *     later.
36177 + */
36178 +
36179 +#include <linux/init.h>
36180 +
36181 +#include <linux/mm.h>
36182 +#include <linux/delay.h>
36183 +#include <linux/spinlock.h>
36184 +#include <linux/smp_lock.h>
36185 +#include <linux/smp.h>
36186 +#include <linux/kernel_stat.h>
36187 +#include <linux/mc146818rtc.h>
36188 +#include <linux/interrupt.h>
36189 +
36190 +#include <asm/mtrr.h>
36191 +#include <asm/pgalloc.h>
36192 +#include <asm/tlbflush.h>
36193 +#include <asm/mach_apic.h>
36194 +#include <asm/mmu_context.h>
36195 +#include <asm/proto.h>
36196 +#include <asm/apicdef.h>
36197 +#include <asm/idle.h>
36198 +#ifdef CONFIG_XEN
36199 +#include <xen/evtchn.h>
36200 +#endif
36201 +
36202 +#ifndef CONFIG_XEN
36203 +/*
36204 + *     Smarter SMP flushing macros. 
36205 + *             c/o Linus Torvalds.
36206 + *
36207 + *     These mean you can really definitely utterly forget about
36208 + *     writing to user space from interrupts. (It's not allowed anyway.)
36209 + *
36210 + *     Optimizations Manfred Spraul <manfred@colorfullife.com>
36211 + *
36212 + *     More scalable flush, from Andi Kleen
36213 + *
36214 + *     To avoid global state use 8 different call vectors.
36215 + *     Each CPU uses a specific vector to trigger flushes on other
36216 + *     CPUs. Depending on the received vector the target CPUs look into
36217 + *     the right per cpu variable for the flush data.
36218 + *
36219 + *     With more than 8 CPUs they are hashed to the 8 available
36220 + *     vectors. The limited global vector space forces us to this right now.
36221 + *     In future when interrupts are split into per CPU domains this could be
36222 + *     fixed, at the cost of triggering multiple IPIs in some cases.
36223 + */
36224 +
36225 +union smp_flush_state {
36226 +       struct {
36227 +               cpumask_t flush_cpumask;
36228 +               struct mm_struct *flush_mm;
36229 +               unsigned long flush_va;
36230 +#define FLUSH_ALL      -1ULL
36231 +               spinlock_t tlbstate_lock;
36232 +       };
36233 +       char pad[SMP_CACHE_BYTES];
36234 +} ____cacheline_aligned;
36235 +
36236 +/* State is put into the per CPU data section, but padded
36237 +   to a full cache line because other CPUs can access it and we don't
36238 +   want false sharing in the per cpu data segment. */
36239 +static DEFINE_PER_CPU(union smp_flush_state, flush_state);
36240 +#endif
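The union-with-pad idiom above deserves a callout: it rounds each per-CPU flush_state up to a whole cache line, so a CPU spinning on its own entry never false-shares with a neighbour's. A self-contained illustration (the SMP_CACHE_BYTES value here is an assumption):

    #include <stdio.h>

    #define SMP_CACHE_BYTES 64      /* assumed x86-64 L1 line size */

    union flush_state {
            struct {
                    unsigned long flush_cpumask;    /* stand-ins for the fields above */
                    void *flush_mm;
                    unsigned long flush_va;
            } s;
            char pad[SMP_CACHE_BYTES];      /* forces sizeof() up to a full line */
    };

    int main(void)
    {
            /* an array of these puts every entry on its own cache line */
            printf("entry size: %zu bytes\n", sizeof(union flush_state));
            return 0;
    }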
36241 +
36242 +/*
36243 + * We cannot call mmdrop() because we are in interrupt context, 
36244 + * instead update mm->cpu_vm_mask.
36245 + */
36246 +static inline void leave_mm(unsigned long cpu)
36247 +{
36248 +       if (read_pda(mmu_state) == TLBSTATE_OK)
36249 +               BUG();
36250 +       clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
36251 +       load_cr3(swapper_pg_dir);
36252 +}
36253 +
36254 +#ifndef CONFIG_XEN
36255 +/*
36256 + *
36257 + * The flush IPI assumes that a thread switch happens in this order:
36258 + * [cpu0: the cpu that switches]
36259 + * 1) switch_mm() either 1a) or 1b)
36260 + * 1a) thread switch to a different mm
36261 + * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
36262 + *     Stop ipi delivery for the old mm. This is not synchronized with
36263 + *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
36264 + *     for the wrong mm, and in the worst case we perform a superfluous
36265 + *     tlb flush.
36266 + * 1a2) set cpu mmu_state to TLBSTATE_OK
36267 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
36268 + *     was in lazy tlb mode.
36269 + * 1a3) update cpu active_mm
36270 + *     Now cpu0 accepts tlb flushes for the new mm.
36271 + * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
36272 + *     Now the other cpus will send tlb flush ipis.
36273 + * 1a5) change cr3.
36274 + * 1b) thread switch without mm change
36275 + *     cpu active_mm is correct, cpu0 already handles
36276 + *     flush ipis.
36277 + * 1b1) set cpu mmu_state to TLBSTATE_OK
36278 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
36279 + *     Atomically set the bit [other cpus will start sending flush ipis],
36280 + *     and test the bit.
36281 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
36282 + * 2) switch %%esp, i.e. current
36283 + *
36284 + * The interrupt must handle 2 special cases:
36285 + * - cr3 is changed before %%esp, i.e. it cannot use current->{active_,}mm.
36286 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
36287 + *   runs in kernel space, the cpu could load tlb entries for user space
36288 + *   pages.
36289 + *
36290 + * The good news is that cpu mmu_state is local to each cpu, no
36291 + * write/read ordering problems.
36292 + */
36293 +
36294 +/*
36295 + * TLB flush IPI:
36296 + *
36297 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
36298 + * 2) Leave the mm if we are in the lazy tlb mode.
36299 + *
36300 + * Interrupts are disabled.
36301 + */
36302 +
36303 +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
36304 +{
36305 +       int cpu;
36306 +       int sender;
36307 +       union smp_flush_state *f;
36308 +
36309 +       cpu = smp_processor_id();
36310 +       /*
36311 +        * orig_rax contains the interrupt vector - 256.
36312 +        * Use that to determine where the sender put the data.
36313 +        */
36314 +       sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
36315 +       f = &per_cpu(flush_state, sender);
36316 +
36317 +       if (!cpu_isset(cpu, f->flush_cpumask))
36318 +               goto out;
36319 +               /* 
36320 +                * This was a BUG() but until someone can quote me the
36321 +                * line from the intel manual that guarantees an IPI to
36322 +                * multiple CPUs is retried _only_ on the erroring CPUs
36323 +                * it's staying as a return
36324 +                *
36325 +                * BUG();
36326 +                */
36327 +                
36328 +       if (f->flush_mm == read_pda(active_mm)) {
36329 +               if (read_pda(mmu_state) == TLBSTATE_OK) {
36330 +                       if (f->flush_va == FLUSH_ALL)
36331 +                               local_flush_tlb();
36332 +                       else
36333 +                               __flush_tlb_one(f->flush_va);
36334 +               } else
36335 +                       leave_mm(cpu);
36336 +       }
36337 +out:
36338 +       ack_APIC_irq();
36339 +       cpu_clear(cpu, f->flush_cpumask);
36340 +}
36341 +
36342 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
36343 +                                               unsigned long va)
36344 +{
36345 +       int sender;
36346 +       union smp_flush_state *f;
36347 +
36348 +       /* Caller has disabled preemption */
36349 +       sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
36350 +       f = &per_cpu(flush_state, sender);
36351 +
36352 +       /* Could avoid this lock when
36353 +          num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
36354 +          probably not worth checking this for a cache-hot lock. */
36355 +       spin_lock(&f->tlbstate_lock);
36356 +
36357 +       f->flush_mm = mm;
36358 +       f->flush_va = va;
36359 +       cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
36360 +
36361 +       /*
36362 +        * We have to send the IPI only to the CPUs affected.
36364 +        */
36365 +       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
36366 +
36367 +       while (!cpus_empty(f->flush_cpumask))
36368 +               cpu_relax();
36369 +
36370 +       f->flush_mm = NULL;
36371 +       f->flush_va = 0;
36372 +       spin_unlock(&f->tlbstate_lock);
36373 +}
36374 +
36375 +int __cpuinit init_smp_flush(void)
36376 +{
36377 +       int i;
36378 +       for_each_cpu_mask(i, cpu_possible_map) {
36379 +               spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
36380 +       }
36381 +       return 0;
36382 +}
36383 +
36384 +core_initcall(init_smp_flush);
36385 +       
36386 +void flush_tlb_current_task(void)
36387 +{
36388 +       struct mm_struct *mm = current->mm;
36389 +       cpumask_t cpu_mask;
36390 +
36391 +       preempt_disable();
36392 +       cpu_mask = mm->cpu_vm_mask;
36393 +       cpu_clear(smp_processor_id(), cpu_mask);
36394 +
36395 +       local_flush_tlb();
36396 +       if (!cpus_empty(cpu_mask))
36397 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
36398 +       preempt_enable();
36399 +}
36400 +
36401 +void flush_tlb_mm (struct mm_struct * mm)
36402 +{
36403 +       cpumask_t cpu_mask;
36404 +
36405 +       preempt_disable();
36406 +       cpu_mask = mm->cpu_vm_mask;
36407 +       cpu_clear(smp_processor_id(), cpu_mask);
36408 +
36409 +       if (current->active_mm == mm) {
36410 +               if (current->mm)
36411 +                       local_flush_tlb();
36412 +               else
36413 +                       leave_mm(smp_processor_id());
36414 +       }
36415 +       if (!cpus_empty(cpu_mask))
36416 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
36417 +
36418 +       preempt_enable();
36419 +}
36420 +
36421 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
36422 +{
36423 +       struct mm_struct *mm = vma->vm_mm;
36424 +       cpumask_t cpu_mask;
36425 +
36426 +       preempt_disable();
36427 +       cpu_mask = mm->cpu_vm_mask;
36428 +       cpu_clear(smp_processor_id(), cpu_mask);
36429 +
36430 +       if (current->active_mm == mm) {
36431 +               if (current->mm)
36432 +                       __flush_tlb_one(va);
36433 +               else
36434 +                       leave_mm(smp_processor_id());
36435 +       }
36436 +
36437 +       if (!cpus_empty(cpu_mask))
36438 +               flush_tlb_others(cpu_mask, mm, va);
36439 +
36440 +       preempt_enable();
36441 +}
36442 +
36443 +static void do_flush_tlb_all(void* info)
36444 +{
36445 +       unsigned long cpu = smp_processor_id();
36446 +
36447 +       __flush_tlb_all();
36448 +       if (read_pda(mmu_state) == TLBSTATE_LAZY)
36449 +               leave_mm(cpu);
36450 +}
36451 +
36452 +void flush_tlb_all(void)
36453 +{
36454 +       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
36455 +}
36456 +#else
36457 +asmlinkage void smp_invalidate_interrupt (void)
36458 +{ return; }
36459 +void flush_tlb_current_task(void)
36460 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
36461 +void flush_tlb_mm (struct mm_struct * mm)
36462 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
36463 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
36464 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
36465 +void flush_tlb_all(void)
36466 +{ xen_tlb_flush_all(); }
36467 +#endif /* Xen */
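Under CONFIG_XEN the whole IPI machinery above collapses into single hypercalls. As an assumption based on the Xen 3.0 public interface (this is a sketch, not code carried in this hunk), xen_tlb_flush_all() amounts to:

    /* struct mmuext_op and HYPERVISOR_mmuext_op() come from the Xen
     * public headers, not from this file */
    struct mmuext_op op;
    op.cmd = MMUEXT_TLB_FLUSH_ALL;
    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);

The _mask and _invlpg variants presumably use the MMUEXT_TLB_FLUSH_MULTI / MMUEXT_INVLPG_MULTI ops with a vCPU bitmap, which is why a guest needs no flush IPIs at all.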
36468 +
36469 +/*
36470 + * this function sends a 'reschedule' IPI to another CPU.
36471 + * it goes straight through and wastes no time serializing
36472 + * anything. Worst case is that we lose a reschedule ...
36473 + */
36474 +
36475 +void smp_send_reschedule(int cpu)
36476 +{
36477 +       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
36478 +}
36479 +
36480 +/*
36481 + * Structure and data for smp_call_function(). This is designed to minimise
36482 + * static memory requirements. It also looks cleaner.
36483 + */
36484 +static DEFINE_SPINLOCK(call_lock);
36485 +
36486 +struct call_data_struct {
36487 +       void (*func) (void *info);
36488 +       void *info;
36489 +       atomic_t started;
36490 +       atomic_t finished;
36491 +       int wait;
36492 +};
36493 +
36494 +static struct call_data_struct * call_data;
36495 +
36496 +void lock_ipi_call_lock(void)
36497 +{
36498 +       spin_lock_irq(&call_lock);
36499 +}
36500 +
36501 +void unlock_ipi_call_lock(void)
36502 +{
36503 +       spin_unlock_irq(&call_lock);
36504 +}
36505 +
36506 +/*
36507 + * this function sends a 'generic call function' IPI to one other CPU
36508 + * in the system.
36509 + *
36510 + * cpu is a standard Linux logical CPU number.
36511 + */
36512 +static void
36513 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
36514 +                               int nonatomic, int wait)
36515 +{
36516 +       struct call_data_struct data;
36517 +       int cpus = 1;
36518 +
36519 +       data.func = func;
36520 +       data.info = info;
36521 +       atomic_set(&data.started, 0);
36522 +       data.wait = wait;
36523 +       if (wait)
36524 +               atomic_set(&data.finished, 0);
36525 +
36526 +       call_data = &data;
36527 +       wmb();
36528 +       /* Send a message to the target CPU and wait for it to respond */
36529 +       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
36530 +
36531 +       /* Wait for response */
36532 +       while (atomic_read(&data.started) != cpus)
36533 +               cpu_relax();
36534 +
36535 +       if (!wait)
36536 +               return;
36537 +
36538 +       while (atomic_read(&data.finished) != cpus)
36539 +               cpu_relax();
36540 +}
36541 +
36542 +/*
36543 + * smp_call_function_single - Run a function on another CPU
36544 + * @func: The function to run. This must be fast and non-blocking.
36545 + * @info: An arbitrary pointer to pass to the function.
36546 + * @nonatomic: Currently unused.
36547 + * @wait: If true, wait until function has completed on other CPUs.
36548 + *
36549 + * Returns 0 on success, else a negative status code.
36550 + *
36551 + * Does not return until the remote CPU is nearly ready to execute <func>
36552 + * or has executed it.
36553 + */
36554 +
36555 +int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
36556 +       int nonatomic, int wait)
36557 +{
36558 +       /* prevent preemption and reschedule on another processor */
36559 +       int me = get_cpu();
36560 +       if (cpu == me) {
36561 +               WARN_ON(1);
36562 +               put_cpu();
36563 +               return -EBUSY;
36564 +       }
36565 +       spin_lock_bh(&call_lock);
36566 +       __smp_call_function_single(cpu, func, info, nonatomic, wait);
36567 +       spin_unlock_bh(&call_lock);
36568 +       put_cpu();
36569 +       return 0;
36570 +}
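A hedged usage sketch for the helper above (the caller names here are hypothetical): run a function on one specific CPU and wait for it, remembering that this 2.6.16-era signature carries the extra nonatomic/wait arguments and must not be called from interrupt context or for the calling CPU itself:

    static void read_remote_tsc(void *info)
    {
            unsigned long long *tsc = info;
            rdtscll(*tsc);                  /* runs on the target CPU */
    }

    static void example(void)
    {
            unsigned long long tsc = 0;
            /* nonatomic=0, wait=1: block until CPU 1 has run the function */
            if (smp_call_function_single(1, read_remote_tsc, &tsc, 0, 1))
                    printk(KERN_WARNING "could not reach CPU 1\n");
    }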
36571 +
36572 +/*
36573 + * this function sends a 'generic call function' IPI to all other CPUs
36574 + * in the system.
36575 + */
36576 +static void __smp_call_function (void (*func) (void *info), void *info,
36577 +                               int nonatomic, int wait)
36578 +{
36579 +       struct call_data_struct data;
36580 +       int cpus = num_online_cpus()-1;
36581 +
36582 +       if (!cpus)
36583 +               return;
36584 +
36585 +       data.func = func;
36586 +       data.info = info;
36587 +       atomic_set(&data.started, 0);
36588 +       data.wait = wait;
36589 +       if (wait)
36590 +               atomic_set(&data.finished, 0);
36591 +
36592 +       call_data = &data;
36593 +       wmb();
36594 +       /* Send a message to all other CPUs and wait for them to respond */
36595 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
36596 +
36597 +       /* Wait for response */
36598 +       while (atomic_read(&data.started) != cpus)
36599 +#ifndef CONFIG_XEN
36600 +               cpu_relax();
36601 +#else
36602 +               barrier();
36603 +#endif
36604 +
36605 +       if (!wait)
36606 +               return;
36607 +
36608 +       while (atomic_read(&data.finished) != cpus)
36609 +#ifndef CONFIG_XEN
36610 +               cpu_relax();
36611 +#else
36612 +               barrier();
36613 +#endif
36614 +}
36615 +
36616 +/*
36617 + * smp_call_function - run a function on all other CPUs.
36618 + * @func: The function to run. This must be fast and non-blocking.
36619 + * @info: An arbitrary pointer to pass to the function.
36620 + * @nonatomic: currently unused.
36621 + * @wait: If true, wait (atomically) until function has completed on other
36622 + *        CPUs.
36623 + *
36624 + * Returns 0 on success, else a negative status code. Does not return until
36625 + * remote CPUs are nearly ready to execute func, or have executed it.
36626 + *
36627 + * You must not call this function with disabled interrupts or from a
36628 + * hardware interrupt handler or from a bottom half handler.
36629 + * Actually there are a few legal cases, like panic.
36630 + */
36631 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
36632 +                       int wait)
36633 +{
36634 +       spin_lock(&call_lock);
36635 +       __smp_call_function(func,info,nonatomic,wait);
36636 +       spin_unlock(&call_lock);
36637 +       return 0;
36638 +}
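And the all-CPU counterpart, again with a hypothetical caller: broadcast a full cache writeback to every other online CPU, then do the local one. Per the comment above, this must run in process context with interrupts enabled:

    static void wbinvd_local(void *unused)
    {
            asm volatile("wbinvd" ::: "memory");    /* write back + invalidate caches */
    }

    static void wbinvd_everywhere(void)
    {
            smp_call_function(wbinvd_local, NULL, 0, 1);    /* all other CPUs, wait */
            wbinvd_local(NULL);                             /* finally, this CPU */
    }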
36639 +
36640 +void smp_stop_cpu(void)
36641 +{
36642 +       unsigned long flags;
36643 +       /*
36644 +        * Remove this CPU:
36645 +        */
36646 +       cpu_clear(smp_processor_id(), cpu_online_map);
36647 +       local_irq_save(flags);
36648 +#ifndef CONFIG_XEN
36649 +       disable_local_APIC();
36650 +#endif
36651 +       local_irq_restore(flags); 
36652 +}
36653 +
36654 +static void smp_really_stop_cpu(void *dummy)
36655 +{
36656 +       smp_stop_cpu(); 
36657 +       for (;;) 
36658 +               asm("hlt"); 
36659 +} 
36660 +
36661 +void smp_send_stop(void)
36662 +{
36663 +       int nolock = 0;
36664 +#ifndef CONFIG_XEN
36665 +       if (reboot_force)
36666 +               return;
36667 +#endif
36668 +       /* Don't deadlock on the call lock in panic */
36669 +       if (!spin_trylock(&call_lock)) {
36670 +               /* ignore locking because we have panicked anyway */
36671 +               nolock = 1;
36672 +       }
36673 +       __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
36674 +       if (!nolock)
36675 +               spin_unlock(&call_lock);
36676 +
36677 +       local_irq_disable();
36678 +#ifndef CONFIG_XEN
36679 +       disable_local_APIC();
36680 +#endif
36681 +       local_irq_enable();
36682 +}
36683 +
36684 +/*
36685 + * Reschedule call back. Nothing to do,
36686 + * all the work is done automatically when
36687 + * we return from the interrupt.
36688 + */
36689 +#ifndef CONFIG_XEN
36690 +asmlinkage void smp_reschedule_interrupt(void)
36691 +#else
36692 +asmlinkage irqreturn_t smp_reschedule_interrupt(void)
36693 +#endif
36694 +{
36695 +#ifndef CONFIG_XEN
36696 +       ack_APIC_irq();
36697 +#else
36698 +       return IRQ_HANDLED;
36699 +#endif
36700 +}
36701 +
36702 +#ifndef CONFIG_XEN
36703 +asmlinkage void smp_call_function_interrupt(void)
36704 +#else
36705 +asmlinkage irqreturn_t smp_call_function_interrupt(void)
36706 +#endif
36707 +{
36708 +       void (*func) (void *info) = call_data->func;
36709 +       void *info = call_data->info;
36710 +       int wait = call_data->wait;
36711 +
36712 +#ifndef CONFIG_XEN
36713 +       ack_APIC_irq();
36714 +#endif
36715 +       /*
36716 +        * Notify initiating CPU that I've grabbed the data and am
36717 +        * about to execute the function
36718 +        */
36719 +       mb();
36720 +       atomic_inc(&call_data->started);
36721 +       /*
36722 +        * At this point the info structure may be out of scope unless wait==1
36723 +        */
36724 +       exit_idle();
36725 +       irq_enter();
36726 +       (*func)(info);
36727 +       irq_exit();
36728 +       if (wait) {
36729 +               mb();
36730 +               atomic_inc(&call_data->finished);
36731 +       }
36732 +#ifdef CONFIG_XEN
36733 +       return IRQ_HANDLED;
36734 +#endif
36735 +}
36736 +
36737 +int safe_smp_processor_id(void)
36738 +{
36739 +#ifdef CONFIG_XEN
36740 +       return smp_processor_id();
36741 +#else
36742 +       int apicid, i;
36743 +
36744 +       if (disable_apic)
36745 +               return 0;
36746 +
36747 +       apicid = hard_smp_processor_id();
36748 +       if (x86_cpu_to_apicid[apicid] == apicid)
36749 +               return apicid;
36750 +
36751 +       for (i = 0; i < NR_CPUS; ++i) {
36752 +               if (x86_cpu_to_apicid[i] == apicid)
36753 +                       return i;
36754 +       }
36755 +
36756 +       /* No entries in x86_cpu_to_apicid?  Either no MPS|ACPI,
36757 +        * or called too early.  Either way, we must be CPU 0. */
36758 +       if (x86_cpu_to_apicid[0] == BAD_APICID)
36759 +               return 0;
36760 +
36761 +       return 0; /* Should not happen */
36762 +#endif
36763 +}
36764 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/traps-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/traps-xen.c
36765 --- ref-linux-2.6.16.9/arch/x86_64/kernel/traps-xen.c   1970-01-01 01:00:00.000000000 +0100
36766 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/traps-xen.c      2006-04-10 00:05:52.000000000 +0200
36767 @@ -0,0 +1,1023 @@
36768 +/*
36769 + *  linux/arch/x86-64/traps.c
36770 + *
36771 + *  Copyright (C) 1991, 1992  Linus Torvalds
36772 + *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
36773 + *
36774 + *  Pentium III FXSR, SSE support
36775 + *     Gareth Hughes <gareth@valinux.com>, May 2000
36776 + *
36777 + *  $Id$
36778 + */
36779 +
36780 +/*
36781 + * 'Traps.c' handles hardware traps and faults after we have saved some
36782 + * state in 'entry.S'.
36783 + */
36784 +#include <linux/config.h>
36785 +#include <linux/sched.h>
36786 +#include <linux/kernel.h>
36787 +#include <linux/string.h>
36788 +#include <linux/errno.h>
36789 +#include <linux/ptrace.h>
36790 +#include <linux/timer.h>
36791 +#include <linux/mm.h>
36792 +#include <linux/init.h>
36793 +#include <linux/delay.h>
36794 +#include <linux/spinlock.h>
36795 +#include <linux/interrupt.h>
36796 +#include <linux/module.h>
36797 +#include <linux/moduleparam.h>
36798 +#include <linux/nmi.h>
36799 +#include <linux/kprobes.h>
36800 +
36801 +#include <asm/system.h>
36802 +#include <asm/uaccess.h>
36803 +#include <asm/io.h>
36804 +#include <asm/atomic.h>
36805 +#include <asm/debugreg.h>
36806 +#include <asm/desc.h>
36807 +#include <asm/i387.h>
36808 +#include <asm/kdebug.h>
36809 +#include <asm/processor.h>
36810 +
36811 +#include <asm/smp.h>
36812 +#include <asm/pgalloc.h>
36813 +#include <asm/pda.h>
36814 +#include <asm/proto.h>
36815 +#include <asm/nmi.h>
36816 +
36817 +#ifndef CONFIG_X86_NO_IDT
36818 +extern struct gate_struct idt_table[256]; 
36819 +#endif
36820 +
36821 +asmlinkage void divide_error(void);
36822 +asmlinkage void debug(void);
36823 +asmlinkage void nmi(void);
36824 +asmlinkage void int3(void);
36825 +asmlinkage void overflow(void);
36826 +asmlinkage void bounds(void);
36827 +asmlinkage void invalid_op(void);
36828 +asmlinkage void device_not_available(void);
36829 +asmlinkage void double_fault(void);
36830 +asmlinkage void coprocessor_segment_overrun(void);
36831 +asmlinkage void invalid_TSS(void);
36832 +asmlinkage void segment_not_present(void);
36833 +asmlinkage void stack_segment(void);
36834 +asmlinkage void general_protection(void);
36835 +asmlinkage void page_fault(void);
36836 +asmlinkage void coprocessor_error(void);
36837 +asmlinkage void simd_coprocessor_error(void);
36838 +asmlinkage void reserved(void);
36839 +asmlinkage void alignment_check(void);
36840 +asmlinkage void machine_check(void);
36841 +asmlinkage void spurious_interrupt_bug(void);
36842 +
36843 +struct notifier_block *die_chain;
36844 +static DEFINE_SPINLOCK(die_notifier_lock);
36845 +
36846 +int register_die_notifier(struct notifier_block *nb)
36847 +{
36848 +       int err = 0;
36849 +       unsigned long flags;
36850 +       spin_lock_irqsave(&die_notifier_lock, flags);
36851 +       err = notifier_chain_register(&die_chain, nb);
36852 +       spin_unlock_irqrestore(&die_notifier_lock, flags);
36853 +       return err;
36854 +}
36855 +
36856 +static inline void conditional_sti(struct pt_regs *regs)
36857 +{
36858 +       if (regs->eflags & X86_EFLAGS_IF)
36859 +               local_irq_enable();
36860 +}
36861 +
36862 +static inline void preempt_conditional_sti(struct pt_regs *regs)
36863 +{
36864 +       preempt_disable();
36865 +       if (regs->eflags & X86_EFLAGS_IF)
36866 +               local_irq_enable();
36867 +}
36868 +
36869 +static inline void preempt_conditional_cli(struct pt_regs *regs)
36870 +{
36871 +       if (regs->eflags & X86_EFLAGS_IF)
36872 +               local_irq_disable();
36873 +       preempt_enable_no_resched();
36874 +}
36875 +
36876 +static int kstack_depth_to_print = 10;
36877 +
36878 +#ifdef CONFIG_KALLSYMS
36879 +#include <linux/kallsyms.h> 
36880 +int printk_address(unsigned long address)
36881 +{ 
36882 +       unsigned long offset = 0, symsize;
36883 +       const char *symname;
36884 +       char *modname;
36885 +       char *delim = ":"; 
36886 +       char namebuf[128];
36887 +
36888 +       symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); 
36889 +       if (!symname) 
36890 +               return printk("[<%016lx>]", address);
36891 +       if (!modname) 
36892 +               modname = delim = "";           
36893 +        return printk("<%016lx>{%s%s%s%s%+ld}",
36894 +                     address,delim,modname,delim,symname,offset); 
36895 +} 
36896 +#else
36897 +int printk_address(unsigned long address)
36898 +{ 
36899 +       return printk("[<%016lx>]", address);
36900 +} 
36901 +#endif
36902 +
36903 +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
36904 +                                       unsigned *usedp, const char **idp)
36905 +{
36906 +#ifndef CONFIG_X86_NO_TSS
36907 +       static char ids[][8] = {
36908 +               [DEBUG_STACK - 1] = "#DB",
36909 +               [NMI_STACK - 1] = "NMI",
36910 +               [DOUBLEFAULT_STACK - 1] = "#DF",
36911 +               [STACKFAULT_STACK - 1] = "#SS",
36912 +               [MCE_STACK - 1] = "#MC",
36913 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
36914 +               [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
36915 +#endif
36916 +       };
36917 +       unsigned k;
36918 +
36919 +       for (k = 0; k < N_EXCEPTION_STACKS; k++) {
36920 +               unsigned long end;
36921 +
36922 +               switch (k + 1) {
36923 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
36924 +               case DEBUG_STACK:
36925 +                       end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
36926 +                       break;
36927 +#endif
36928 +               default:
36929 +                       end = per_cpu(init_tss, cpu).ist[k];
36930 +                       break;
36931 +               }
36932 +               if (stack >= end)
36933 +                       continue;
36934 +               if (stack >= end - EXCEPTION_STKSZ) {
36935 +                       if (*usedp & (1U << k))
36936 +                               break;
36937 +                       *usedp |= 1U << k;
36938 +                       *idp = ids[k];
36939 +                       return (unsigned long *)end;
36940 +               }
36941 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
36942 +               if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
36943 +                       unsigned j = N_EXCEPTION_STACKS - 1;
36944 +
36945 +                       do {
36946 +                               ++j;
36947 +                               end -= EXCEPTION_STKSZ;
36948 +                               ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
36949 +                       } while (stack < end - EXCEPTION_STKSZ);
36950 +                       if (*usedp & (1U << j))
36951 +                               break;
36952 +                       *usedp |= 1U << j;
36953 +                       *idp = ids[j];
36954 +                       return (unsigned long *)end;
36955 +               }
36956 +#endif
36957 +       }
36958 +#endif
36959 +       return NULL;
36960 +}
36961 +
36962 +/*
36963 + * x86-64 can have up to three kernel stacks:
36964 + * process stack
36965 + * interrupt stack
36966 + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
36967 + */
36968 +
36969 +void show_trace(unsigned long *stack)
36970 +{
36971 +       const unsigned cpu = safe_smp_processor_id();
36972 +       unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
36973 +       int i;
36974 +       unsigned used = 0;
36975 +
36976 +       printk("\nCall Trace:");
36977 +
36978 +#define HANDLE_STACK(cond) \
36979 +       do while (cond) { \
36980 +               unsigned long addr = *stack++; \
36981 +               if (kernel_text_address(addr)) { \
36982 +                       if (i > 50) { \
36983 +                               printk("\n       "); \
36984 +                               i = 0; \
36985 +                       } \
36986 +                       else \
36987 +                               i += printk(" "); \
36988 +                       /* \
36989 +                        * If the address is either in the text segment of the \
36990 +                        * kernel, or in the region which contains vmalloc'ed \
36991 +                        * memory, it *may* be the address of a calling \
36992 +                        * routine; if so, print it so that someone tracing \
36993 +                        * down the cause of the crash will be able to figure \
36994 +                        * out the call path that was taken. \
36995 +                        */ \
36996 +                       i += printk_address(addr); \
36997 +               } \
36998 +       } while (0)
36999 +
37000 +       for(i = 11; ; ) {
37001 +               const char *id;
37002 +               unsigned long *estack_end;
37003 +               estack_end = in_exception_stack(cpu, (unsigned long)stack,
37004 +                                               &used, &id);
37005 +
37006 +               if (estack_end) {
37007 +                       i += printk(" <%s>", id);
37008 +                       HANDLE_STACK (stack < estack_end);
37009 +                       i += printk(" <EOE>");
37010 +                       stack = (unsigned long *) estack_end[-2];
37011 +                       continue;
37012 +               }
37013 +               if (irqstack_end) {
37014 +                       unsigned long *irqstack;
37015 +                       irqstack = irqstack_end -
37016 +                               (IRQSTACKSIZE - 64) / sizeof(*irqstack);
37017 +
37018 +                       if (stack >= irqstack && stack < irqstack_end) {
37019 +                               i += printk(" <IRQ>");
37020 +                               HANDLE_STACK (stack < irqstack_end);
37021 +                               stack = (unsigned long *) (irqstack_end[-1]);
37022 +                               irqstack_end = NULL;
37023 +                               i += printk(" <EOI>");
37024 +                               continue;
37025 +                       }
37026 +               }
37027 +               break;
37028 +       }
37029 +
37030 +       HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
37031 +#undef HANDLE_STACK
37032 +       printk("\n");
37033 +}
37034 +
37035 +void show_stack(struct task_struct *tsk, unsigned long * rsp)
37036 +{
37037 +       unsigned long *stack;
37038 +       int i;
37039 +       const int cpu = safe_smp_processor_id();
37040 +       unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
37041 +       unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
37042 +
37043 +       // debugging aid: "show_stack(NULL, NULL);" prints the
37044 +       // back trace for this cpu.
37045 +
37046 +       if (rsp == NULL) {
37047 +               if (tsk)
37048 +                       rsp = (unsigned long *)tsk->thread.rsp;
37049 +               else
37050 +                       rsp = (unsigned long *)&rsp;
37051 +       }
37052 +
37053 +       stack = rsp;
37054 +       for(i=0; i < kstack_depth_to_print; i++) {
37055 +               if (stack >= irqstack && stack <= irqstack_end) {
37056 +                       if (stack == irqstack_end) {
37057 +                               stack = (unsigned long *) (irqstack_end[-1]);
37058 +                               printk(" <EOI> ");
37059 +                       }
37060 +               } else {
37061 +                       if (((long) stack & (THREAD_SIZE-1)) == 0)
37062 +                               break;
37063 +               }
37064 +               if (i && ((i % 4) == 0))
37065 +                       printk("\n       ");
37066 +               printk("%016lx ", *stack++);
37067 +               touch_nmi_watchdog();
37068 +       }
37069 +       show_trace((unsigned long *)rsp);
37070 +}
37071 +
37072 +/*
37073 + * The architecture-independent dump_stack generator
37074 + */
37075 +void dump_stack(void)
37076 +{
37077 +       unsigned long dummy;
37078 +       show_trace(&dummy);
37079 +}
37080 +
37081 +EXPORT_SYMBOL(dump_stack);
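dump_stack() above is the exported entry point driver code uses to get a backtrace without dying; a trivial, hypothetical call site:

    static void warn_unexpected(void)
    {
            printk(KERN_WARNING "unexpected state on CPU %d\n",
                   smp_processor_id());
            dump_stack();           /* backtrace of the current context */
    }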
37082 +
37083 +void show_registers(struct pt_regs *regs)
37084 +{
37085 +       int i;
37086 +       int in_kernel = !user_mode(regs);
37087 +       unsigned long rsp;
37088 +       const int cpu = safe_smp_processor_id(); 
37089 +       struct task_struct *cur = cpu_pda(cpu)->pcurrent;
37090 +
37091 +       rsp = regs->rsp;
37092 +
37093 +       printk("CPU %d ", cpu);
37094 +       __show_regs(regs);
37095 +       printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
37096 +               cur->comm, cur->pid, task_thread_info(cur), cur);
37097 +
37098 +       /*
37099 +        * When in-kernel, we also print out the stack and code at the
37100 +        * time of the fault..
37101 +        */
37102 +       if (in_kernel) {
37103 +
37104 +               printk("Stack: ");
37105 +               show_stack(NULL, (unsigned long*)rsp);
37106 +
37107 +               printk("\nCode: ");
37108 +               if(regs->rip < PAGE_OFFSET)
37109 +                       goto bad;
37110 +
37111 +               for(i=0;i<20;i++)
37112 +               {
37113 +                       unsigned char c;
37114 +                       if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
37115 +bad:
37116 +                               printk(" Bad RIP value.");
37117 +                               break;
37118 +                       }
37119 +                       printk("%02x ", c);
37120 +               }
37121 +       }
37122 +       printk("\n");
37123 +}      
37124 +
37125 +void handle_BUG(struct pt_regs *regs)
37126 +{ 
37127 +       struct bug_frame f;
37128 +       long len;
37129 +       const char *prefix = "";
37130 +
37131 +       if (user_mode(regs))
37132 +               return; 
37133 +       if (__copy_from_user(&f, (const void __user *) regs->rip,
37134 +                            sizeof(struct bug_frame)))
37135 +               return; 
37136 +       if (f.filename >= 0 ||
37137 +           f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) 
37138 +               return;
37139 +       len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
37140 +       if (len < 0 || len >= PATH_MAX)
37141 +               f.filename = (int)(long)"unmapped filename";
37142 +       else if (len > 50) {
37143 +               f.filename += len - 50;
37144 +               prefix = "...";
37145 +       }
37146 +       printk("----------- [cut here ] --------- [please bite here ] ---------\n");
37147 +       printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
37148 +} 
37149 +
37150 +#ifdef CONFIG_BUG
37151 +void out_of_line_bug(void)
37152 +{ 
37153 +       BUG(); 
37154 +} 
37155 +#endif
37156 +
37157 +static DEFINE_SPINLOCK(die_lock);
37158 +static int die_owner = -1;
37159 +
37160 +unsigned __kprobes long oops_begin(void)
37161 +{
37162 +       int cpu = safe_smp_processor_id();
37163 +       unsigned long flags;
37164 +
37165 +       /* racy, but better than risking deadlock. */
37166 +       local_irq_save(flags);
37167 +       if (!spin_trylock(&die_lock)) { 
37168 +               if (cpu == die_owner) 
37169 +                       /* nested oops. should stop eventually */;
37170 +               else
37171 +                       spin_lock(&die_lock);
37172 +       }
37173 +       die_owner = cpu;
37174 +       console_verbose();
37175 +       bust_spinlocks(1);
37176 +       return flags;
37177 +}
37178 +
37179 +void __kprobes oops_end(unsigned long flags)
37180 +{ 
37181 +       die_owner = -1;
37182 +       bust_spinlocks(0);
37183 +       spin_unlock_irqrestore(&die_lock, flags);
37184 +       if (panic_on_oops)
37185 +               panic("Oops");
37186 +}
37187 +
37188 +void __kprobes __die(const char * str, struct pt_regs * regs, long err)
37189 +{
37190 +       static int die_counter;
37191 +       printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
37192 +#ifdef CONFIG_PREEMPT
37193 +       printk("PREEMPT ");
37194 +#endif
37195 +#ifdef CONFIG_SMP
37196 +       printk("SMP ");
37197 +#endif
37198 +#ifdef CONFIG_DEBUG_PAGEALLOC
37199 +       printk("DEBUG_PAGEALLOC");
37200 +#endif
37201 +       printk("\n");
37202 +       notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
37203 +       show_registers(regs);
37204 +       /* Executive summary in case the oops scrolled away */
37205 +       printk(KERN_ALERT "RIP ");
37206 +       printk_address(regs->rip); 
37207 +       printk(" RSP <%016lx>\n", regs->rsp); 
37208 +}
37209 +
37210 +void die(const char * str, struct pt_regs * regs, long err)
37211 +{
37212 +       unsigned long flags = oops_begin();
37213 +
37214 +       handle_BUG(regs);
37215 +       __die(str, regs, err);
37216 +       oops_end(flags);
37217 +       do_exit(SIGSEGV); 
37218 +}
37219 +
37220 +#ifdef CONFIG_X86_LOCAL_APIC
37221 +void __kprobes die_nmi(char *str, struct pt_regs *regs)
37222 +{
37223 +       unsigned long flags = oops_begin();
37224 +
37225 +       /*
37226 +        * We are in trouble anyway, let's at least try
37227 +        * to get a message out.
37228 +        */
37229 +       printk(str, safe_smp_processor_id());
37230 +       show_registers(regs);
37231 +       if (panic_on_timeout || panic_on_oops)
37232 +               panic("nmi watchdog");
37233 +       printk("console shuts up ...\n");
37234 +       oops_end(flags);
37235 +       do_exit(SIGSEGV);
37236 +}
37237 +#endif
37238 +
37239 +static void __kprobes do_trap(int trapnr, int signr, char *str,
37240 +                             struct pt_regs * regs, long error_code,
37241 +                             siginfo_t *info)
37242 +{
37243 +       struct task_struct *tsk = current;
37244 +
37245 +       conditional_sti(regs);
37246 +
37247 +       tsk->thread.error_code = error_code;
37248 +       tsk->thread.trap_no = trapnr;
37249 +
37250 +       if (user_mode(regs)) {
37251 +               if (exception_trace && unhandled_signal(tsk, signr))
37252 +                       printk(KERN_INFO
37253 +                              "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
37254 +                              tsk->comm, tsk->pid, str,
37255 +                              regs->rip,regs->rsp,error_code); 
37256 +
37257 +               if (info)
37258 +                       force_sig_info(signr, info, tsk);
37259 +               else
37260 +                       force_sig(signr, tsk);
37261 +               return;
37262 +       }
37263 +
37264 +
37265 +       /* kernel trap */ 
37266 +       {            
37267 +               const struct exception_table_entry *fixup;
37268 +               fixup = search_exception_tables(regs->rip);
37269 +               if (fixup) {
37270 +                       regs->rip = fixup->fixup;
37271 +               } else  
37272 +                       die(str, regs, error_code);
37273 +               return;
37274 +       }
37275 +}
37276 +
37277 +#define DO_ERROR(trapnr, signr, str, name) \
37278 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
37279 +{ \
37280 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
37281 +                                                       == NOTIFY_STOP) \
37282 +               return; \
37283 +       do_trap(trapnr, signr, str, regs, error_code, NULL); \
37284 +}
37285 +
37286 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
37287 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
37288 +{ \
37289 +       siginfo_t info; \
37290 +       info.si_signo = signr; \
37291 +       info.si_errno = 0; \
37292 +       info.si_code = sicode; \
37293 +       info.si_addr = (void __user *)siaddr; \
37294 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
37295 +                                                       == NOTIFY_STOP) \
37296 +               return; \
37297 +       do_trap(trapnr, signr, str, regs, error_code, &info); \
37298 +}
37299 +
37300 +DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
37301 +DO_ERROR( 4, SIGSEGV, "overflow", overflow)
37302 +DO_ERROR( 5, SIGSEGV, "bounds", bounds)
37303 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
37304 +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
37305 +DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
37306 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
37307 +DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
37308 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
37309 +DO_ERROR(18, SIGSEGV, "reserved", reserved)
37310 +DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
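For clarity, here is what one of the generator lines above, DO_ERROR( 4, SIGSEGV, "overflow", overflow), expands to after preprocessing:

    asmlinkage void do_overflow(struct pt_regs * regs, long error_code)
    {
            if (notify_die(DIE_TRAP, "overflow", regs, error_code, 4, SIGSEGV)
                                                            == NOTIFY_STOP)
                    return;
            do_trap(4, SIGSEGV, "overflow", regs, error_code, NULL);
    }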
37311 +
37312 +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
37313 +{
37314 +       static const char str[] = "double fault";
37315 +       struct task_struct *tsk = current;
37316 +
37317 +       /* Return value not checked because a double fault cannot be ignored */
37318 +       notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
37319 +
37320 +       tsk->thread.error_code = error_code;
37321 +       tsk->thread.trap_no = 8;
37322 +
37323 +       /* This is always a kernel trap and never fixable (and thus must
37324 +          never return). */
37325 +       for (;;)
37326 +               die(str, regs, error_code);
37327 +}
37328 +
37329 +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
37330 +                                               long error_code)
37331 +{
37332 +       struct task_struct *tsk = current;
37333 +
37334 +       conditional_sti(regs);
37335 +
37336 +       tsk->thread.error_code = error_code;
37337 +       tsk->thread.trap_no = 13;
37338 +
37339 +       if (user_mode(regs)) {
37340 +               if (exception_trace && unhandled_signal(tsk, SIGSEGV))
37341 +                       printk(KERN_INFO
37342 +                      "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
37343 +                              tsk->comm, tsk->pid,
37344 +                              regs->rip,regs->rsp,error_code); 
37345 +
37346 +               force_sig(SIGSEGV, tsk);
37347 +               return;
37348 +       } 
37349 +
37350 +       /* kernel gp */
37351 +       {
37352 +               const struct exception_table_entry *fixup;
37353 +               fixup = search_exception_tables(regs->rip);
37354 +               if (fixup) {
37355 +                       regs->rip = fixup->fixup;
37356 +                       return;
37357 +               }
37358 +               if (notify_die(DIE_GPF, "general protection fault", regs,
37359 +                                       error_code, 13, SIGSEGV) == NOTIFY_STOP)
37360 +                       return;
37361 +               die("general protection fault", regs, error_code);
37362 +       }
37363 +}
37364 +
37365 +static __kprobes void
37366 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
37367 +{
37368 +       printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
37369 +       printk("You probably have a hardware problem with your RAM chips\n");
37370 +
37371 +#if 0 /* XEN */
37372 +       /* Clear and disable the memory parity error line. */
37373 +       reason = (reason & 0xf) | 4;
37374 +       outb(reason, 0x61);
37375 +#endif /* XEN */
37376 +}
37377 +
37378 +static __kprobes void
37379 +io_check_error(unsigned char reason, struct pt_regs * regs)
37380 +{
37381 +       printk("NMI: IOCK error (debug interrupt?)\n");
37382 +       show_registers(regs);
37383 +
37384 +#if 0 /* XEN */
37385 +       /* Re-enable the IOCK line, wait for a few seconds */
37386 +       reason = (reason & 0xf) | 8;
37387 +       outb(reason, 0x61);
37388 +       mdelay(2000);
37389 +       reason &= ~8;
37390 +       outb(reason, 0x61);
37391 +#endif /* XEN */
37392 +}
37393 +
37394 +static __kprobes void
37395 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
37396 +{
37396 +       printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
37397 +       printk("Dazed and confused, but trying to continue\n");
37398 +       printk("Do you have a strange power saving mode enabled?\n");
37399 +}
37400 +
37401 +/* Runs on IST stack. This code must keep interrupts off all the time.
37402 +   Nested NMIs are prevented by the CPU. */
37403 +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
37404 +{
37405 +       unsigned char reason = 0;
37406 +       int cpu;
37407 +
37408 +       cpu = smp_processor_id();
37409 +
37410 +       /* Only the BSP gets external NMIs from the system.  */
37411 +       if (!cpu)
37412 +               reason = get_nmi_reason();
37413 +
37414 +       if (!(reason & 0xc0)) {
37415 +               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
37416 +                                                               == NOTIFY_STOP)
37417 +                       return;
37418 +#ifdef CONFIG_X86_LOCAL_APIC
37419 +               /*
37420 +                * Ok, so this is none of the documented NMI sources,
37421 +                * so it must be the NMI watchdog.
37422 +                */
37423 +               if (nmi_watchdog > 0) {
37424 +                       nmi_watchdog_tick(regs,reason);
37425 +                       return;
37426 +               }
37427 +#endif
37428 +               unknown_nmi_error(reason, regs);
37429 +               return;
37430 +       }
37431 +       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
37432 +               return; 
37433 +
37434 +       /* AK: following checks seem to be broken on modern chipsets. FIXME */
37435 +
37436 +       if (reason & 0x80)
37437 +               mem_parity_error(reason, regs);
37438 +       if (reason & 0x40)
37439 +               io_check_error(reason, regs);
37440 +}
37441 +
37442 +/* runs on IST stack. */
37443 +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
37444 +{
37445 +       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
37446 +               return;
37447 +       }
37448 +       do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
37449 +       return;
37450 +}
37451 +
37452 +/* Help a handler running on an IST stack to switch back to the process
37453 +   stack for scheduling or signal handling. The actual stack switch is
37454 +   done in entry.S. */
37455 +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
37456 +{
37457 +       struct pt_regs *regs = eregs;
37458 +       /* Already synced; nothing to do */
37459 +       if (eregs == (struct pt_regs *)eregs->rsp)
37460 +               ;
37461 +       /* Exception from user space */
37462 +       else if (user_mode(eregs))
37463 +               regs = task_pt_regs(current);
37464 +       /* Exception from kernel and interrupts are enabled. Move to
37465 +          kernel process stack. */
37466 +       else if (eregs->eflags & X86_EFLAGS_IF)
37467 +               regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
37468 +       if (eregs != regs)
37469 +               *regs = *eregs;
37470 +       return regs;
37471 +}
37472 +
37473 +/* runs on IST stack. */
37474 +asmlinkage void __kprobes do_debug(struct pt_regs * regs,
37475 +                                  unsigned long error_code)
37476 +{
37477 +       unsigned long condition;
37478 +       struct task_struct *tsk = current;
37479 +       siginfo_t info;
37480 +
37481 +       get_debugreg(condition, 6);
37482 +
37483 +       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
37484 +                                               SIGTRAP) == NOTIFY_STOP)
37485 +               return;
37486 +
37487 +       preempt_conditional_sti(regs);
37488 +
37489 +       /* Mask out spurious debug traps due to lazy DR7 setting */
37490 +       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
37491 +               if (!tsk->thread.debugreg7) { 
37492 +                       goto clear_dr7;
37493 +               }
37494 +       }
37495 +
37496 +       tsk->thread.debugreg6 = condition;
37497 +
37498 +       /* Mask out spurious TF errors due to lazy TF clearing */
37499 +       if (condition & DR_STEP) {
37500 +               /*
37501 +                * The TF error should be masked out only if the current
37502 +                * process is not traced and if the TRAP flag has been set
37503 +                * previously by a tracing process (condition detected by
37504 +                * the PT_DTRACE flag); remember that the i386 TRAP flag
37505 +                * can be modified by the process itself in user mode,
37506 +                * allowing programs to debug themselves without the ptrace()
37507 +                * interface.
37508 +                */
37509 +                if (!user_mode(regs))
37510 +                       goto clear_TF_reenable;
37511 +               /*
37512 +                * Was the TF flag set by a debugger? If so, clear it now,
37513 +                * so that register information is correct.
37514 +                */
37515 +               if (tsk->ptrace & PT_DTRACE) {
37516 +                       regs->eflags &= ~TF_MASK;
37517 +                       tsk->ptrace &= ~PT_DTRACE;
37518 +               }
37519 +       }
37520 +
37521 +       /* Ok, finally something we can handle */
37522 +       tsk->thread.trap_no = 1;
37523 +       tsk->thread.error_code = error_code;
37524 +       info.si_signo = SIGTRAP;
37525 +       info.si_errno = 0;
37526 +       info.si_code = TRAP_BRKPT;
37527 +       info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
37528 +       force_sig_info(SIGTRAP, &info, tsk);
37529 +
37530 +clear_dr7:
37531 +       set_debugreg(0UL, 7);
37532 +       preempt_conditional_cli(regs);
37533 +       return;
37534 +
37535 +clear_TF_reenable:
37536 +       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
37537 +       regs->eflags &= ~TF_MASK;
37538 +       preempt_conditional_cli(regs);
37539 +}
37540 +
37541 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
37542 +{
37543 +       const struct exception_table_entry *fixup;
37544 +       fixup = search_exception_tables(regs->rip);
37545 +       if (fixup) {
37546 +               regs->rip = fixup->fixup;
37547 +               return 1;
37548 +       }
37549 +       notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
37550 +       /* Illegal floating point operation in the kernel */
37551 +       current->thread.trap_no = trapnr;
37552 +       die(str, regs, 0);
37553 +       return 0;
37554 +}
37555 +
37556 +/*
37557 + * Note that we play around with the 'TS' bit in an attempt to get
37558 + * the correct behaviour even in the presence of the asynchronous
37559 + * IRQ13 behaviour
37560 + */
37561 +asmlinkage void do_coprocessor_error(struct pt_regs *regs)
37562 +{
37563 +       void __user *rip = (void __user *)(regs->rip);
37564 +       struct task_struct * task;
37565 +       siginfo_t info;
37566 +       unsigned short cwd, swd;
37567 +
37568 +       conditional_sti(regs);
37569 +       if (!user_mode(regs) &&
37570 +           kernel_math_error(regs, "kernel x87 math error", 16))
37571 +               return;
37572 +
37573 +       /*
37574 +        * Save the info for the exception handler and clear the error.
37575 +        */
37576 +       task = current;
37577 +       save_init_fpu(task);
37578 +       task->thread.trap_no = 16;
37579 +       task->thread.error_code = 0;
37580 +       info.si_signo = SIGFPE;
37581 +       info.si_errno = 0;
37582 +       info.si_code = __SI_FAULT;
37583 +       info.si_addr = rip;
37584 +       /*
37585 +        * (~cwd & swd) masks out exceptions whose status is not set to
37586 +        * unmasked.  0x3f covers the exception bits in these registers,
37587 +        * 0x200 is the C1 bit you need in case of a stack fault, and 0x040
37588 +        * is the stack fault bit.  We should only be taking one exception
37589 +        * at a time, so if this combination doesn't produce any single
37590 +        * exception, we have a bad program that isn't synchronizing its
37591 +        * FPU usage, and it will suffer the consequences since we won't be
37592 +        * able to fully reproduce the context of the exception.
37593 +        */
37594 +       cwd = get_fpu_cwd(task);
37595 +       swd = get_fpu_swd(task);
37596 +       switch (swd & ~cwd & 0x3f) {
37597 +               case 0x000:
37598 +               default:
37599 +                       break;
37600 +               case 0x001: /* Invalid Op */
37601 +                       /*
37602 +                        * swd & 0x240 == 0x040: Stack Underflow
37603 +                        * swd & 0x240 == 0x240: Stack Overflow
37604 +                        * User must clear the SF bit (0x40) if set
37605 +                        */
37606 +                       info.si_code = FPE_FLTINV;
37607 +                       break;
37608 +               case 0x002: /* Denormalize */
37609 +               case 0x010: /* Underflow */
37610 +                       info.si_code = FPE_FLTUND;
37611 +                       break;
37612 +               case 0x004: /* Zero Divide */
37613 +                       info.si_code = FPE_FLTDIV;
37614 +                       break;
37615 +               case 0x008: /* Overflow */
37616 +                       info.si_code = FPE_FLTOVF;
37617 +                       break;
37618 +               case 0x020: /* Precision */
37619 +                       info.si_code = FPE_FLTRES;
37620 +                       break;
37621 +       }
37622 +       force_sig_info(SIGFPE, &info, task);
37623 +}
37624 +
37625 +asmlinkage void bad_intr(void)
37626 +{
37627 +       printk("bad interrupt\n");
37628 +}
37629 +
37630 +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
37631 +{
37632 +       void __user *rip = (void __user *)(regs->rip);
37633 +       struct task_struct * task;
37634 +       siginfo_t info;
37635 +       unsigned short mxcsr;
37636 +
37637 +       conditional_sti(regs);
37638 +       if (!user_mode(regs) &&
37639 +               kernel_math_error(regs, "kernel simd math error", 19))
37640 +               return;
37641 +
37642 +       /*
37643 +        * Save the info for the exception handler and clear the error.
37644 +        */
37645 +       task = current;
37646 +       save_init_fpu(task);
37647 +       task->thread.trap_no = 19;
37648 +       task->thread.error_code = 0;
37649 +       info.si_signo = SIGFPE;
37650 +       info.si_errno = 0;
37651 +       info.si_code = __SI_FAULT;
37652 +       info.si_addr = rip;
37653 +       /*
37654 +        * The SIMD FPU exceptions are handled a little differently, as there
37655 +        * is only a single status/control register.  Thus, to determine which
37656 +        * unmasked exception was caught we must mask the exception mask bits
37657 +        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
37658 +        */
37659 +       mxcsr = get_fpu_mxcsr(task);
37660 +       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
37661 +               case 0x000:
37662 +               default:
37663 +                       break;
37664 +               case 0x001: /* Invalid Op */
37665 +                       info.si_code = FPE_FLTINV;
37666 +                       break;
37667 +               case 0x002: /* Denormalize */
37668 +               case 0x010: /* Underflow */
37669 +                       info.si_code = FPE_FLTUND;
37670 +                       break;
37671 +               case 0x004: /* Zero Divide */
37672 +                       info.si_code = FPE_FLTDIV;
37673 +                       break;
37674 +               case 0x008: /* Overflow */
37675 +                       info.si_code = FPE_FLTOVF;
37676 +                       break;
37677 +               case 0x020: /* Precision */
37678 +                       info.si_code = FPE_FLTRES;
37679 +                       break;
37680 +       }
37681 +       force_sig_info(SIGFPE, &info, task);
37682 +}
37683 +
37684 +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
37685 +{
37686 +}
37687 +
37688 +#if 0
37689 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
37690 +{
37691 +}
37692 +#endif
37693 +
37694 +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
37695 +{
37696 +}
37697 +
37698 +/*
37699 + *  'math_state_restore()' saves the current math information in the
37700 + * old math state array, and gets the new ones from the current task
37701 + *
37702 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
37703 + * Don't touch unless you *really* know how it works.
37704 + */
37705 +asmlinkage void math_state_restore(void)
37706 +{
37707 +       struct task_struct *me = current;
37708 +        /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
37709 +
37710 +       if (!used_math())
37711 +               init_fpu(me);
37712 +       restore_fpu_checking(&me->thread.i387.fxsave);
37713 +       task_thread_info(me)->status |= TS_USEDFPU;
37714 +}
37715 +
37716 +
37717 +/*
37718 + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
37719 + * specify <dpl>|4 in the second field.
37720 + */
37721 +static trap_info_t trap_table[] = {
37722 +        {  0, 0|4, __KERNEL_CS, (unsigned long)divide_error               },
37723 +        {  1, 0|4, __KERNEL_CS, (unsigned long)debug                      },
37724 +        {  3, 3|4, __KERNEL_CS, (unsigned long)int3                       },
37725 +        {  4, 3|4, __KERNEL_CS, (unsigned long)overflow                   },
37726 +        {  5, 0|4, __KERNEL_CS, (unsigned long)bounds                     },
37727 +        {  6, 0|4, __KERNEL_CS, (unsigned long)invalid_op                 },
37728 +        {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available       },
37729 +        {  9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
37730 +        { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS                },
37731 +        { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present        },
37732 +        { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment              },
37733 +        { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection         },
37734 +        { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault                 },
37735 +        { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug     },
37736 +        { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error          },
37737 +        { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check            },
37738 +#ifdef CONFIG_X86_MCE
37739 +        { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check              },
37740 +#endif
37741 +        { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
37742 +#ifdef CONFIG_IA32_EMULATION
37743 +       { IA32_SYSCALL_VECTOR, 3|4, __KERNEL_CS, (unsigned long)ia32_syscall},
37744 +#endif
37745 +        {  0, 0,           0, 0                                              }
37746 +};
37747 +
37748 +void __init trap_init(void)
37749 +{
37750 +        int ret;
37751 +
37752 +        ret = HYPERVISOR_set_trap_table(trap_table);
37753 +        
37754 +        if (ret) 
37755 +                printk("HYPERVISOR_set_trap_table failed: error %d\n",
37756 +                       ret);
37757 +
37758 +       /*
37759 +        * Should be a barrier for any external CPU state.
37760 +        */
37761 +       cpu_init();
37762 +}
37763 +
37764 +void smp_trap_init(trap_info_t *trap_ctxt)
37765 +{
37766 +       trap_info_t *t = trap_table;
37767 +
37768 +       for (t = trap_table; t->address; t++) {
37769 +               trap_ctxt[t->vector].flags = t->flags;
37770 +               trap_ctxt[t->vector].cs = t->cs;
37771 +               trap_ctxt[t->vector].address = t->address;
37772 +       }
37773 +}
37774 +
37775 +
37776 +/* Actual parsing is done early in setup.c. */
37777 +static int __init oops_dummy(char *s)
37778 +{ 
37779 +       panic_on_oops = 1;
37780 +       return -1; 
37781 +} 
37782 +__setup("oops=", oops_dummy); 
37783 +
37784 +static int __init kstack_setup(char *s)
37785 +{
37786 +       kstack_depth_to_print = simple_strtoul(s,NULL,0);
37787 +       return 0;
37788 +}
37789 +__setup("kstack=", kstack_setup);
37790 +
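[Editor's note: illustrative aside, not part of the patch. do_coprocessor_error() above selects an FPE si_code by masking the x87 status word (swd) with the complement of the control word (cwd), so only exception bits the task left unmasked survive the "swd & ~cwd & 0x3f" test. A minimal user-space sketch of the same decode, using hypothetical register values:

	#include <stdio.h>

	/* Mirrors the switch in do_coprocessor_error(): cwd's low 6 bits are
	   the exception masks, swd's low 6 bits the sticky exception flags. */
	static const char *fpe_name(unsigned short cwd, unsigned short swd)
	{
		switch (swd & ~cwd & 0x3f) {
		case 0x001: return "FPE_FLTINV";   /* invalid operation */
		case 0x002:                        /* denormal operand  */
		case 0x010: return "FPE_FLTUND";   /* underflow         */
		case 0x004: return "FPE_FLTDIV";   /* divide by zero    */
		case 0x008: return "FPE_FLTOVF";   /* overflow          */
		case 0x020: return "FPE_FLTRES";   /* inexact result    */
		default:    return "none/ambiguous";
		}
	}

	int main(void)
	{
		/* cwd 0x0340 leaves all six exception masks clear (unmasked);
		   swd 0x0004 reports a zero divide, so this prints FPE_FLTDIV. */
		printf("%s\n", fpe_name(0x0340, 0x0004));
		return 0;
	}

The trap_table[] above packs information the same way: the second field is <dpl>|4, where bit 2 asks Xen for an "interrupt gate" (events masked on entry) and the low bits give the privilege level allowed to raise the trap, 3 for int3, overflow and the IA32 syscall vector, 0 for everything else.]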
37791 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/vsyscall-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/vsyscall-xen.c
37792 --- ref-linux-2.6.16.9/arch/x86_64/kernel/vsyscall-xen.c        1970-01-01 01:00:00.000000000 +0100
37793 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/vsyscall-xen.c   2006-04-10 00:05:52.000000000 +0200
37794 @@ -0,0 +1,239 @@
37795 +/*
37796 + *  linux/arch/x86_64/kernel/vsyscall.c
37797 + *
37798 + *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
37799 + *  Copyright 2003 Andi Kleen, SuSE Labs.
37800 + *
37801 + *  Thanks to hpa@transmeta.com for some useful hint.
37802 + *  Special thanks to Ingo Molnar for his early experience with
37803 + *  a different vsyscall implementation for Linux/IA32 and for the name.
37804 + *
37805 + *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
37806 + *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
37807 + *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
37808 + *  jumping out of line if necessary. We cannot add more with this
37809 + *  mechanism because older kernels won't return -ENOSYS.
37810 + *  If we want more than four we need a vDSO.
37811 + *
37812 + *  Note: the concept clashes with user mode linux. If you use UML and
37813 + *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
37814 + */
37815 +
37816 +#include <linux/time.h>
37817 +#include <linux/init.h>
37818 +#include <linux/kernel.h>
37819 +#include <linux/timer.h>
37820 +#include <linux/seqlock.h>
37821 +#include <linux/jiffies.h>
37822 +#include <linux/sysctl.h>
37823 +
37824 +#include <asm/vsyscall.h>
37825 +#include <asm/pgtable.h>
37826 +#include <asm/page.h>
37827 +#include <asm/fixmap.h>
37828 +#include <asm/errno.h>
37829 +#include <asm/io.h>
37830 +
37831 +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
37832 +
37833 +int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
37834 +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
37835 +
37836 +#include <asm/unistd.h>
37837 +
37838 +static __always_inline void timeval_normalize(struct timeval * tv)
37839 +{
37840 +       time_t __sec;
37841 +
37842 +       __sec = tv->tv_usec / 1000000;
37843 +       if (__sec) {
37844 +               tv->tv_usec %= 1000000;
37845 +               tv->tv_sec += __sec;
37846 +       }
37847 +}
37848 +
37849 +static __always_inline void do_vgettimeofday(struct timeval * tv)
37850 +{
37851 +       long sequence, t;
37852 +       unsigned long sec, usec;
37853 +
37854 +       do {
37855 +               sequence = read_seqbegin(&__xtime_lock);
37856 +               
37857 +               sec = __xtime.tv_sec;
37858 +               usec = (__xtime.tv_nsec / 1000) +
37859 +                       (__jiffies - __wall_jiffies) * (1000000 / HZ);
37860 +
37861 +               if (__vxtime.mode != VXTIME_HPET) {
37862 +                       t = get_cycles_sync();
37863 +                       if (t < __vxtime.last_tsc)
37864 +                               t = __vxtime.last_tsc;
37865 +                       usec += ((t - __vxtime.last_tsc) *
37866 +                                __vxtime.tsc_quot) >> 32;
37867 +                       /* See comment in x86_64 do_gettimeofday. */
37868 +               } else {
37869 +                       usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
37870 +                                 __vxtime.last) * __vxtime.quot) >> 32;
37871 +               }
37872 +       } while (read_seqretry(&__xtime_lock, sequence));
37873 +
37874 +       tv->tv_sec = sec + usec / 1000000;
37875 +       tv->tv_usec = usec % 1000000;
37876 +}
37877 +
37878 +/* RED-PEN may want to re-add seq locking, but then the variable should be write-once. */
37879 +static __always_inline void do_get_tz(struct timezone * tz)
37880 +{
37881 +       *tz = __sys_tz;
37882 +}
37883 +
37884 +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
37885 +{
37886 +       int ret;
37887 +       asm volatile("vsysc2: syscall"
37888 +               : "=a" (ret)
37889 +               : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
37890 +       return ret;
37891 +}
37892 +
37893 +static __always_inline long time_syscall(long *t)
37894 +{
37895 +       long secs;
37896 +       asm volatile("vsysc1: syscall"
37897 +               : "=a" (secs)
37898 +               : "0" (__NR_time),"D" (t) : __syscall_clobber);
37899 +       return secs;
37900 +}
37901 +
37902 +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
37903 +{
37904 +       if (unlikely(!__sysctl_vsyscall))
37905 +               return gettimeofday(tv,tz);
37906 +       if (tv)
37907 +               do_vgettimeofday(tv);
37908 +       if (tz)
37909 +               do_get_tz(tz);
37910 +       return 0;
37911 +}
37912 +
37913 +/* This will break when the xtime seconds get inaccurate, but that is
37914 + * unlikely */
37915 +time_t __vsyscall(1) vtime(time_t *t)
37916 +{
37917 +       if (unlikely(!__sysctl_vsyscall))
37918 +               return time_syscall(t);
37919 +       else if (t)
37920 +               *t = __xtime.tv_sec;            
37921 +       return __xtime.tv_sec;
37922 +}
37923 +
37924 +long __vsyscall(2) venosys_0(void)
37925 +{
37926 +       return -ENOSYS;
37927 +}
37928 +
37929 +long __vsyscall(3) venosys_1(void)
37930 +{
37931 +       return -ENOSYS;
37932 +}
37933 +
37934 +#ifdef CONFIG_SYSCTL
37935 +
37936 +#define SYSCALL 0x050f
37937 +#define NOP2    0x9090
37938 +
37939 +/*
37940 + * NOP out syscall in vsyscall page when not needed.
37941 + */
37942 +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
37943 +                        void __user *buffer, size_t *lenp, loff_t *ppos)
37944 +{
37945 +       extern u16 vsysc1, vsysc2;
37946 +       u16 *map1, *map2;
37947 +       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
37948 +       if (!write)
37949 +               return ret;
37950 +       /* gcc has some trouble with __va(__pa()), so just do it this
37951 +          way. */
37952 +       map1 = ioremap(__pa_symbol(&vsysc1), 2);
37953 +       if (!map1)
37954 +               return -ENOMEM;
37955 +       map2 = ioremap(__pa_symbol(&vsysc2), 2);
37956 +       if (!map2) {
37957 +               ret = -ENOMEM;
37958 +               goto out;
37959 +       }
37960 +       if (!sysctl_vsyscall) {
37961 +               *map1 = SYSCALL;
37962 +               *map2 = SYSCALL;
37963 +       } else {
37964 +               *map1 = NOP2;
37965 +               *map2 = NOP2;
37966 +       }
37967 +       iounmap(map2);
37968 +out:
37969 +       iounmap(map1);
37970 +       return ret;
37971 +}
37972 +
37973 +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
37974 +                               void __user *oldval, size_t __user *oldlenp,
37975 +                               void __user *newval, size_t newlen,
37976 +                               void **context)
37977 +{
37978 +       return -ENOSYS;
37979 +}
37980 +
37981 +static ctl_table kernel_table2[] = {
37982 +       { .ctl_name = 99, .procname = "vsyscall64",
37983 +         .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
37984 +         .strategy = vsyscall_sysctl_nostrat,
37985 +         .proc_handler = vsyscall_sysctl_change },
37986 +       { 0, }
37987 +};
37988 +
37989 +static ctl_table kernel_root_table2[] = {
37990 +       { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
37991 +         .child = kernel_table2 },
37992 +       { 0 },
37993 +};
37994 +
37995 +#endif
37996 +
37997 +static void __init map_vsyscall(void)
37998 +{
37999 +       extern char __vsyscall_0;
38000 +       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
38001 +
38002 +       __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
38003 +}
38004 +
38005 +#ifdef CONFIG_XEN
38006 +static void __init map_vsyscall_user(void)
38007 +{
38008 +       extern void __set_fixmap_user(enum fixed_addresses, unsigned long, pgprot_t);
38009 +       extern char __vsyscall_0;
38010 +       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
38011 +
38012 +       __set_fixmap_user(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
38013 +}
38014 +#endif
38015 +
38016 +static int __init vsyscall_init(void)
38017 +{
38018 +       BUG_ON(((unsigned long) &vgettimeofday !=
38019 +                       VSYSCALL_ADDR(__NR_vgettimeofday)));
38020 +       BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
38021 +       BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
38022 +       map_vsyscall();
38023 +#ifdef CONFIG_XEN
38024 +       map_vsyscall_user();
38025 +       sysctl_vsyscall = 0; /* disable vgettimeofday() */
38026 +#endif
38027 +#ifdef CONFIG_SYSCTL
38028 +       register_sysctl_table(kernel_root_table2, 0);
38029 +#endif
38030 +       return 0;
38031 +}
38032 +
38033 +__initcall(vsyscall_init);
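[Editor's note: illustrative aside, not part of the patch. The header comment above describes a fixed layout: vsyscall slots start 10 MB below the top of the address space and are 1024 bytes apart, with at most four slots. A standalone sketch of the address arithmetic, using the constants as defined in the 2.6.16 <asm-x86_64/vsyscall.h>:

	#include <stdio.h>

	#define VSYSCALL_START   (-10UL << 20)   /* 10 MB below address 0  */
	#define VSYSCALL_SIZE    1024UL          /* one slot per vsyscall  */
	#define VSYSCALL_ADDR(n) (VSYSCALL_START + VSYSCALL_SIZE * (n))

	int main(void)
	{
		for (int n = 0; n < 4; n++)
			printf("vsyscall %d at %#lx\n", n, VSYSCALL_ADDR(n));
		return 0;
	}

This prints 0xffffffffff600000, 0xffffffffff600400 and so on, which is why vsyscall_init() can BUG_ON() any mismatch between the linker-placed vgettimeofday/vtime symbols and VSYSCALL_ADDR(). Note also the sysctl handler's SYSCALL (0x050f) and NOP2 (0x9090) constants: written through a u16 they patch the two-byte "0f 05" syscall instruction in or out at the vsysc1/vsysc2 sites, so the fallback path is real code whenever kernel.vsyscall64 is 0. The Xen build maps the page into the user fixmap as well and forces sysctl_vsyscall to 0, so guests always take the syscall fallback.]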
38034 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/x8664_ksyms-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/kernel/x8664_ksyms-xen.c
38035 --- ref-linux-2.6.16.9/arch/x86_64/kernel/x8664_ksyms-xen.c     1970-01-01 01:00:00.000000000 +0100
38036 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/x8664_ksyms-xen.c        2006-04-10 00:05:52.000000000 +0200
38037 @@ -0,0 +1,163 @@
38038 +#include <linux/config.h>
38039 +#include <linux/module.h>
38040 +#include <linux/smp.h>
38041 +#include <linux/user.h>
38042 +#include <linux/sched.h>
38043 +#include <linux/in6.h>
38044 +#include <linux/interrupt.h>
38045 +#include <linux/smp_lock.h>
38046 +#include <linux/pm.h>
38047 +#include <linux/pci.h>
38048 +#include <linux/apm_bios.h>
38049 +#include <linux/kernel.h>
38050 +#include <linux/string.h>
38051 +#include <linux/syscalls.h>
38052 +#include <linux/tty.h>
38053 +
38054 +#include <asm/semaphore.h>
38055 +#include <asm/processor.h>
38056 +#include <asm/i387.h>
38057 +#include <asm/uaccess.h>
38058 +#include <asm/checksum.h>
38059 +#include <asm/io.h>
38060 +#include <asm/delay.h>
38061 +#include <asm/irq.h>
38062 +#include <asm/mmx.h>
38063 +#include <asm/desc.h>
38064 +#include <asm/pgtable.h>
38065 +#include <asm/pgalloc.h>
38066 +#include <asm/nmi.h>
38067 +#include <asm/kdebug.h>
38068 +#include <asm/unistd.h>
38069 +#include <asm/tlbflush.h>
38070 +#include <asm/kdebug.h>
38071 +
38072 +#ifdef CONFIG_SMP
38073 +extern void __write_lock_failed(rwlock_t *rw);
38074 +extern void __read_lock_failed(rwlock_t *rw);
38075 +#endif
38076 +
38077 +/* platform dependent support */
38078 +EXPORT_SYMBOL(boot_cpu_data);
38079 +//EXPORT_SYMBOL(dump_fpu);
38080 +EXPORT_SYMBOL(kernel_thread);
38081 +EXPORT_SYMBOL(pm_idle);
38082 +EXPORT_SYMBOL(pm_power_off);
38083 +
38084 +EXPORT_SYMBOL(__down_failed);
38085 +EXPORT_SYMBOL(__down_failed_interruptible);
38086 +EXPORT_SYMBOL(__down_failed_trylock);
38087 +EXPORT_SYMBOL(__up_wakeup);
38088 +/* Networking helper routines. */
38089 +EXPORT_SYMBOL(csum_partial_copy_nocheck);
38090 +EXPORT_SYMBOL(ip_compute_csum);
38091 +/* Delay loops */
38092 +EXPORT_SYMBOL(__udelay);
38093 +EXPORT_SYMBOL(__ndelay);
38094 +EXPORT_SYMBOL(__delay);
38095 +EXPORT_SYMBOL(__const_udelay);
38096 +
38097 +EXPORT_SYMBOL(__get_user_1);
38098 +EXPORT_SYMBOL(__get_user_2);
38099 +EXPORT_SYMBOL(__get_user_4);
38100 +EXPORT_SYMBOL(__get_user_8);
38101 +EXPORT_SYMBOL(__put_user_1);
38102 +EXPORT_SYMBOL(__put_user_2);
38103 +EXPORT_SYMBOL(__put_user_4);
38104 +EXPORT_SYMBOL(__put_user_8);
38105 +
38106 +EXPORT_SYMBOL(strncpy_from_user);
38107 +EXPORT_SYMBOL(__strncpy_from_user);
38108 +EXPORT_SYMBOL(clear_user);
38109 +EXPORT_SYMBOL(__clear_user);
38110 +EXPORT_SYMBOL(copy_user_generic);
38111 +EXPORT_SYMBOL(copy_from_user);
38112 +EXPORT_SYMBOL(copy_to_user);
38113 +EXPORT_SYMBOL(copy_in_user);
38114 +EXPORT_SYMBOL(strnlen_user);
38115 +
38116 +#ifdef CONFIG_PCI
38117 +EXPORT_SYMBOL(pci_mem_start);
38118 +#endif
38119 +
38120 +EXPORT_SYMBOL(copy_page);
38121 +EXPORT_SYMBOL(clear_page);
38122 +
38123 +EXPORT_SYMBOL(_cpu_pda);
38124 +#ifdef CONFIG_SMP
38125 +EXPORT_SYMBOL(__write_lock_failed);
38126 +EXPORT_SYMBOL(__read_lock_failed);
38127 +
38128 +EXPORT_SYMBOL(smp_call_function);
38129 +#endif
38130 +
38131 +#ifdef CONFIG_VT
38132 +EXPORT_SYMBOL(screen_info);
38133 +#endif
38134 +
38135 +EXPORT_SYMBOL(get_wchan);
38136 +
38137 +#ifdef CONFIG_X86_LOCAL_APIC
38138 +EXPORT_SYMBOL_GPL(set_nmi_callback);
38139 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
38140 +#endif
38141 +
38142 +/* Export string functions. We normally rely on gcc builtins for most of these,
38143 +   but gcc sometimes decides not to inline them. */    
38144 +#undef memcpy
38145 +#undef memset
38146 +#undef memmove
38147 +#undef strlen
38148 +
38149 +extern void * memset(void *,int,__kernel_size_t);
38150 +extern size_t strlen(const char *);
38151 +extern void * memmove(void * dest,const void *src,size_t count);
38152 +extern void * memcpy(void *,const void *,__kernel_size_t);
38153 +extern void * __memcpy(void *,const void *,__kernel_size_t);
38154 +
38155 +EXPORT_SYMBOL(memset);
38156 +EXPORT_SYMBOL(strlen);
38157 +EXPORT_SYMBOL(memmove);
38158 +EXPORT_SYMBOL(memcpy);
38159 +EXPORT_SYMBOL(__memcpy);
38160 +
38161 +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
38162 +/* prototypes are wrong, these are assembly with custom calling conventions */
38163 +extern void rwsem_down_read_failed_thunk(void);
38164 +extern void rwsem_wake_thunk(void);
38165 +extern void rwsem_downgrade_thunk(void);
38166 +extern void rwsem_down_write_failed_thunk(void);
38167 +EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
38168 +EXPORT_SYMBOL(rwsem_wake_thunk);
38169 +EXPORT_SYMBOL(rwsem_downgrade_thunk);
38170 +EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
38171 +#endif
38172 +
38173 +EXPORT_SYMBOL(empty_zero_page);
38174 +
38175 +EXPORT_SYMBOL(die_chain);
38176 +EXPORT_SYMBOL(register_die_notifier);
38177 +
38178 +#ifdef CONFIG_SMP
38179 +EXPORT_SYMBOL(cpu_sibling_map);
38180 +EXPORT_SYMBOL(smp_num_siblings);
38181 +#endif
38182 +
38183 +extern void do_softirq_thunk(void);
38184 +EXPORT_SYMBOL(do_softirq_thunk);
38185 +
38186 +#ifdef CONFIG_BUG
38187 +EXPORT_SYMBOL(out_of_line_bug);
38188 +#endif
38189 +
38190 +EXPORT_SYMBOL(init_level4_pgt);
38191 +
38192 +extern unsigned long __supported_pte_mask;
38193 +EXPORT_SYMBOL(__supported_pte_mask);
38194 +
38195 +#ifdef CONFIG_SMP
38196 +EXPORT_SYMBOL(flush_tlb_page);
38197 +#endif
38198 +
38199 +EXPORT_SYMBOL(load_gs_index);
38200 +
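[Editor's note: illustrative aside, not part of the patch. x8664_ksyms-xen.c only publishes symbols; the consumers are loadable modules. A hypothetical 2.6-era module that links only because of the EXPORT_SYMBOL() lines above (the ksyms_demo_* names are invented for this sketch):

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <linux/string.h>
	#include <asm/processor.h>

	static char buf[16];

	static int __init ksyms_demo_init(void)
	{
		/* memcpy and boot_cpu_data resolve against the exports above. */
		memcpy(buf, boot_cpu_data.x86_model_id, sizeof(buf) - 1);
		printk(KERN_INFO "ksyms_demo: cpu model %s\n", buf);
		return 0;
	}

	static void __exit ksyms_demo_exit(void)
	{
	}

	module_init(ksyms_demo_init);
	module_exit(ksyms_demo_exit);
	MODULE_LICENSE("GPL");

A symbol that is not exported (note dump_fpu is commented out above) would leave such a module with an unresolved reference at load time.]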
38201 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/kernel/xen_entry.S tmp-linux-2.6-xen.patch/arch/x86_64/kernel/xen_entry.S
38202 --- ref-linux-2.6.16.9/arch/x86_64/kernel/xen_entry.S   1970-01-01 01:00:00.000000000 +0100
38203 +++ tmp-linux-2.6-xen.patch/arch/x86_64/kernel/xen_entry.S      2006-04-10 00:05:52.000000000 +0200
38204 @@ -0,0 +1,40 @@
38205 +/*
38206 + * Copied from arch/xen/i386/kernel/entry.S
38207 + */                        
38208 +/* Offsets into shared_info_t. */                
38209 +#define evtchn_upcall_pending          /* 0 */
38210 +#define evtchn_upcall_mask             1
38211 +
38212 +#define sizeof_vcpu_shift              6
38213 +
38214 +#ifdef CONFIG_SMP
38215 +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
38216 +//#define preempt_enable(reg)  decl threadinfo_preempt_count(reg)
38217 +#define preempt_disable(reg)
38218 +#define preempt_enable(reg)
38219 +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp)                   ; \
38220 +                               movq %gs:pda_cpunumber,reg              ; \
38221 +                               shl  $32, reg                           ; \
38222 +                               shr  $32-sizeof_vcpu_shift,reg          ; \
38223 +                               addq HYPERVISOR_shared_info,reg
38224 +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp)
38225 +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
38226 +#else
38227 +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
38228 +#define XEN_PUT_VCPU_INFO(reg)
38229 +#define XEN_PUT_VCPU_INFO_fixup
38230 +#endif
38231 +
38232 +#define XEN_LOCKED_BLOCK_EVENTS(reg)   movb $1,evtchn_upcall_mask(reg)
38233 +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
38234 +#define XEN_BLOCK_EVENTS(reg)  XEN_GET_VCPU_INFO(reg)                  ; \
38235 +                               XEN_LOCKED_BLOCK_EVENTS(reg)            ; \
38236 +                               XEN_PUT_VCPU_INFO(reg)
38237 +#define XEN_UNBLOCK_EVENTS(reg)        XEN_GET_VCPU_INFO(reg)                  ; \
38238 +                               XEN_LOCKED_UNBLOCK_EVENTS(reg)          ; \
38239 +                               XEN_PUT_VCPU_INFO(reg)
38240 +#define XEN_TEST_PENDING(reg)  testb $0xFF,evtchn_upcall_pending(reg)
38241 +
38242 +VGCF_IN_SYSCALL = (1<<8)
38243 +        
38244 +       
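[Editor's note: illustrative aside, not part of the patch. The macros above operate on the per-VCPU vcpu_info area inside the Xen shared-info page; the empty "#define evtchn_upcall_pending /* 0 */" is deliberate, making "evtchn_upcall_pending(reg)" expand to "(reg)", i.e. offset 0. A C-level sketch of the same operations, assuming the layout implied above (two status bytes at the start of each 1 << sizeof_vcpu_shift = 64-byte slot):

	/* Hypothetical mirror of the fields the assembly touches. */
	struct vcpu_info_sketch {
		unsigned char evtchn_upcall_pending;	/* offset 0 */
		unsigned char evtchn_upcall_mask;	/* offset 1 */
		/* ... remainder of the 64-byte slot ... */
	};

	/* XEN_BLOCK_EVENTS: mask event delivery, the guest's "cli". */
	static inline void xen_block_events(struct vcpu_info_sketch *v)
	{
		v->evtchn_upcall_mask = 1;
	}

	/* XEN_UNBLOCK_EVENTS: re-enable event delivery, the guest's "sti". */
	static inline void xen_unblock_events(struct vcpu_info_sketch *v)
	{
		v->evtchn_upcall_mask = 0;
	}

	/* XEN_TEST_PENDING: did an event arrive while we were masked? */
	static inline int xen_test_pending(const struct vcpu_info_sketch *v)
	{
		return v->evtchn_upcall_pending != 0;
	}

On SMP, XEN_GET_VCPU_INFO first locates the slot as shared_info + cpu * 64; the shl $32 / shr $(32 - sizeof_vcpu_shift) pair is just a masked "cpu << 6" computed from pda_cpunumber.]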
38245 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/Makefile tmp-linux-2.6-xen.patch/arch/x86_64/Makefile
38246 --- ref-linux-2.6.16.9/arch/x86_64/Makefile     2006-04-19 08:10:14.000000000 +0200
38247 +++ tmp-linux-2.6-xen.patch/arch/x86_64/Makefile        2006-04-10 00:05:52.000000000 +0200
38248 @@ -31,6 +31,10 @@ cflags-$(CONFIG_MK8) += $(call cc-option
38249  cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
38250  CFLAGS += $(cflags-y)
38251  
38252 +cppflags-$(CONFIG_XEN) += \
38253 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
38254 +CPPFLAGS += $(cppflags-y)
38255 +
38256  CFLAGS += -m64
38257  CFLAGS += -mno-red-zone
38258  CFLAGS += -mcmodel=kernel
38259 @@ -70,6 +74,21 @@ boot := arch/x86_64/boot
38260  .PHONY: bzImage bzlilo install archmrproper \
38261         fdimage fdimage144 fdimage288 archclean
38262  
38263 +ifdef CONFIG_XEN
38264 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
38265 +head-y := arch/x86_64/kernel/head-xen.o arch/x86_64/kernel/head64-xen.o arch/x86_64/kernel/init_task.o
38266 +LDFLAGS_vmlinux := -e _start
38267 +boot := arch/i386/boot-xen
38268 +.PHONY: vmlinuz
38269 +#Default target when executing "make"
38270 +all: vmlinuz
38271 +
38272 +vmlinuz: vmlinux
38273 +       $(Q)$(MAKE) $(build)=$(boot) $@
38274 +
38275 +install:
38276 +       $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
38277 +else
38278  #Default target when executing "make"
38279  all: bzImage
38280  
38281 @@ -90,6 +109,7 @@ fdimage fdimage144 fdimage288: vmlinux
38282  
38283  install:
38284         $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ 
38285 +endif
38286  
38287  archclean:
38288         $(Q)$(MAKE) $(clean)=$(boot)
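[Editor's note: not part of the patch. With CONFIG_XEN=y this hunk makes vmlinuz the default target instead of bzImage, points head-y at the Xen-specific head objects, prepends the mach-xen include path, and delegates both "make vmlinuz" and "make install" to the arch/i386/boot-xen makefile, passing XENGUEST through so installed image names can carry a guest suffix. A Xen guest kernel is loaded by the domain builder rather than a BIOS boot loader, which is why no bzImage is produced.]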
38289 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/mm/fault-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/mm/fault-xen.c
38290 --- ref-linux-2.6.16.9/arch/x86_64/mm/fault-xen.c       1970-01-01 01:00:00.000000000 +0100
38291 +++ tmp-linux-2.6-xen.patch/arch/x86_64/mm/fault-xen.c  2006-04-10 00:05:52.000000000 +0200
38292 @@ -0,0 +1,595 @@
38293 +/*
38294 + *  linux/arch/x86-64/mm/fault.c
38295 + *
38296 + *  Copyright (C) 1995  Linus Torvalds
38297 + *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
38298 + */
38299 +
38300 +#include <linux/config.h>
38301 +#include <linux/signal.h>
38302 +#include <linux/sched.h>
38303 +#include <linux/kernel.h>
38304 +#include <linux/errno.h>
38305 +#include <linux/string.h>
38306 +#include <linux/types.h>
38307 +#include <linux/ptrace.h>
38308 +#include <linux/mman.h>
38309 +#include <linux/mm.h>
38310 +#include <linux/smp.h>
38311 +#include <linux/smp_lock.h>
38312 +#include <linux/interrupt.h>
38313 +#include <linux/init.h>
38314 +#include <linux/tty.h>
38315 +#include <linux/vt_kern.h>             /* For unblank_screen() */
38316 +#include <linux/compiler.h>
38317 +#include <linux/module.h>
38318 +#include <linux/kprobes.h>
38319 +
38320 +#include <asm/system.h>
38321 +#include <asm/uaccess.h>
38322 +#include <asm/pgalloc.h>
38323 +#include <asm/smp.h>
38324 +#include <asm/tlbflush.h>
38325 +#include <asm/proto.h>
38326 +#include <asm/kdebug.h>
38327 +#include <asm-generic/sections.h>
38328 +
38329 +/* Page fault error code bits */
38330 +#define PF_PROT        (1<<0)          /* or no page found */
38331 +#define PF_WRITE       (1<<1)
38332 +#define PF_USER        (1<<2)
38333 +#define PF_RSVD        (1<<3)
38334 +#define PF_INSTR       (1<<4)
38335 +
38336 +void bust_spinlocks(int yes)
38337 +{
38338 +       int loglevel_save = console_loglevel;
38339 +       if (yes) {
38340 +               oops_in_progress = 1;
38341 +       } else {
38342 +#ifdef CONFIG_VT
38343 +               unblank_screen();
38344 +#endif
38345 +               oops_in_progress = 0;
38346 +               /*
38347 +                * OK, the message is on the console.  Now we call printk()
38348 +                * without oops_in_progress set so that printk will give klogd
38349 +                * a poke.  Hold onto your hats...
38350 +                */
38351 +               console_loglevel = 15;          /* NMI oopser may have shut the console up */
38352 +               printk(" ");
38353 +               console_loglevel = loglevel_save;
38354 +       }
38355 +}
38356 +
38357 +/* Sometimes the CPU reports invalid exceptions on prefetch.
38358 +   Check for that here and ignore it.
38359 +   Opcode checker based on code by Richard Brunner */
38360 +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
38361 +                               unsigned long error_code)
38362 +{ 
38363 +       unsigned char *instr;
38364 +       int scan_more = 1;
38365 +       int prefetch = 0; 
38366 +       unsigned char *max_instr;
38367 +
38368 +       /* If it was an exec fault, ignore it */
38369 +       if (error_code & PF_INSTR)
38370 +               return 0;
38371 +       
38372 +       instr = (unsigned char *)convert_rip_to_linear(current, regs);
38373 +       max_instr = instr + 15;
38374 +
38375 +       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
38376 +               return 0;
38377 +
38378 +       while (scan_more && instr < max_instr) { 
38379 +               unsigned char opcode;
38380 +               unsigned char instr_hi;
38381 +               unsigned char instr_lo;
38382 +
38383 +               if (__get_user(opcode, instr))
38384 +                       break; 
38385 +
38386 +               instr_hi = opcode & 0xf0; 
38387 +               instr_lo = opcode & 0x0f; 
38388 +               instr++;
38389 +
38390 +               switch (instr_hi) { 
38391 +               case 0x20:
38392 +               case 0x30:
38393 +                       /* Values 0x26,0x2E,0x36,0x3E are valid x86
38394 +                          prefixes.  In long mode, the CPU will signal
38395 +                          invalid opcode if some of these prefixes are
38396 +                          present so we will never get here anyway */
38397 +                       scan_more = ((instr_lo & 7) == 0x6);
38398 +                       break;
38399 +                       
38400 +               case 0x40:
38401 +                       /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
38402 +                          Need to figure out under what instruction mode the
38403 +                          instruction was issued ... */
38404 +                       /* Could check the LDT for lm, but for now it's good
38405 +                          enough to assume that long mode only uses well known
38406 +                          segments or kernel. */
38407 +                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
38408 +                       break;
38409 +                       
38410 +               case 0x60:
38411 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
38412 +                       scan_more = (instr_lo & 0xC) == 0x4;
38413 +                       break;          
38414 +               case 0xF0:
38415 +                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
38416 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
38417 +                       break;                  
38418 +               case 0x00:
38419 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
38420 +                       scan_more = 0;
38421 +                       if (__get_user(opcode, instr)) 
38422 +                               break;
38423 +                       prefetch = (instr_lo == 0xF) &&
38424 +                               (opcode == 0x0D || opcode == 0x18);
38425 +                       break;                  
38426 +               default:
38427 +                       scan_more = 0;
38428 +                       break;
38429 +               } 
38430 +       }
38431 +       return prefetch;
38432 +}
38433 +
38434 +static int bad_address(void *p) 
38435 +{ 
38436 +       unsigned long dummy;
38437 +       return __get_user(dummy, (unsigned long *)p);
38438 +} 
38439 +
38440 +void dump_pagetable(unsigned long address)
38441 +{
38442 +       pgd_t *pgd;
38443 +       pud_t *pud;
38444 +       pmd_t *pmd;
38445 +       pte_t *pte;
38446 +
38447 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
38448 +       pgd += pgd_index(address);
38449 +       if (bad_address(pgd)) goto bad;
38450 +       printk("PGD %lx ", pgd_val(*pgd));
38451 +       if (!pgd_present(*pgd)) goto ret; 
38452 +
38453 +       pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
38454 +       if (bad_address(pud)) goto bad;
38455 +       printk("PUD %lx ", pud_val(*pud));
38456 +       if (!pud_present(*pud)) goto ret;
38457 +
38458 +       pmd = pmd_offset(pud, address);
38459 +       if (bad_address(pmd)) goto bad;
38460 +       printk("PMD %lx ", pmd_val(*pmd));
38461 +       if (!pmd_present(*pmd)) goto ret;        
38462 +
38463 +       pte = pte_offset_kernel(pmd, address);
38464 +       if (bad_address(pte)) goto bad;
38465 +       printk("PTE %lx", pte_val(*pte)); 
38466 +ret:
38467 +       printk("\n");
38468 +       return;
38469 +bad:
38470 +       printk("BAD\n");
38471 +}
38472 +
38473 +static const char errata93_warning[] = 
38474 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
38475 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
38476 +KERN_ERR "******* Please consider a BIOS update.\n"
38477 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
38478 +
38479 +/* Workaround for K8 erratum #93 and buggy BIOSes.
38480 +   BIOS SMM functions are required to use a specific workaround
38481 +   to avoid corruption of the 64-bit RIP register on C-stepping K8.
38482 +   A lot of BIOSes that didn't get tested properly miss this.
38483 +   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
38484 +   Try to work around it here.
38485 +   Note we only handle faults in the kernel here. */
38486 +
38487 +static int is_errata93(struct pt_regs *regs, unsigned long address) 
38488 +{
38489 +       static int warned;
38490 +       if (address != regs->rip)
38491 +               return 0;
38492 +       if ((address >> 32) != 0) 
38493 +               return 0;
38494 +       address |= 0xffffffffUL << 32;
38495 +       if ((address >= (u64)_stext && address <= (u64)_etext) || 
38496 +           (address >= MODULES_VADDR && address <= MODULES_END)) { 
38497 +               if (!warned) {
38498 +                       printk(errata93_warning);               
38499 +                       warned = 1;
38500 +               }
38501 +               regs->rip = address;
38502 +               return 1;
38503 +       }
38504 +       return 0;
38505 +} 
38506 +
38507 +int unhandled_signal(struct task_struct *tsk, int sig)
38508 +{
38509 +       if (tsk->pid == 1)
38510 +               return 1;
38511 +       if (tsk->ptrace & PT_PTRACED)
38512 +               return 0;
38513 +       return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
38514 +               (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
38515 +}
38516 +
38517 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
38518 +                                unsigned long error_code)
38519 +{
38520 +       unsigned long flags = oops_begin();
38521 +       struct task_struct *tsk;
38522 +
38523 +       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
38524 +              current->comm, address);
38525 +       dump_pagetable(address);
38526 +       tsk = current;
38527 +       tsk->thread.cr2 = address;
38528 +       tsk->thread.trap_no = 14;
38529 +       tsk->thread.error_code = error_code;
38530 +       __die("Bad pagetable", regs, error_code);
38531 +       oops_end(flags);
38532 +       do_exit(SIGKILL);
38533 +}
38534 +
38535 +/*
38536 + * Handle a fault on the vmalloc area
38537 + *
38538 + * This assumes no large pages in there.
38539 + */
38540 +static int vmalloc_fault(unsigned long address)
38541 +{
38542 +       pgd_t *pgd, *pgd_ref;
38543 +       pud_t *pud, *pud_ref;
38544 +       pmd_t *pmd, *pmd_ref;
38545 +       pte_t *pte, *pte_ref;
38546 +
38547 +       /* Copy kernel mappings over when needed. This can also
38548 +          happen within a race in a page table update. In the latter
38549 +          case just flush. */
38550 +
38551 +       /* On Xen the line below does not always work. Needs investigating! */
38552 +       /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
38553 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
38554 +       pgd += pgd_index(address);
38555 +       pgd_ref = pgd_offset_k(address);
38556 +       if (pgd_none(*pgd_ref))
38557 +               return -1;
38558 +       if (pgd_none(*pgd))
38559 +               set_pgd(pgd, *pgd_ref);
38560 +
38561 +       /* Below here mismatches are bugs because these lower tables
38562 +          are shared */
38563 +
38564 +       pud = pud_offset(pgd, address);
38565 +       pud_ref = pud_offset(pgd_ref, address);
38566 +       if (pud_none(*pud_ref))
38567 +               return -1;
38568 +       if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
38569 +               BUG();
38570 +       pmd = pmd_offset(pud, address);
38571 +       pmd_ref = pmd_offset(pud_ref, address);
38572 +       if (pmd_none(*pmd_ref))
38573 +               return -1;
38574 +       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
38575 +               BUG();
38576 +       pte_ref = pte_offset_kernel(pmd_ref, address);
38577 +       if (!pte_present(*pte_ref))
38578 +               return -1;
38579 +       pte = pte_offset_kernel(pmd, address);
38580 +       /* Don't use pte_page here, because the mappings can point
38581 +          outside mem_map, and the NUMA hash lookup cannot handle
38582 +          that. */
38583 +       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
38584 +               BUG();
38585 +       return 0;
38586 +}
38587 +
38588 +int page_fault_trace = 0;
38589 +int exception_trace = 1;
38590 +
38591 +
38592 +#define MEM_VERBOSE 1
38593 +
38594 +#ifdef MEM_VERBOSE
38595 +#define MEM_LOG(_f, _a...)                     \
38596 +       printk("fault.c:[%d]-> " _f "\n",       \
38597 +       __LINE__ , ## _a )
38598 +#else
38599 +#define MEM_LOG(_f, _a...) ((void)0)
38600 +#endif
38601 +
38602 +/*
38603 + * This routine handles page faults.  It determines the address,
38604 + * and the problem, and then passes it off to one of the appropriate
38605 + * routines.
38606 + */
38607 +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
38608 +                                       unsigned long error_code)
38609 +{
38610 +       struct task_struct *tsk;
38611 +       struct mm_struct *mm;
38612 +       struct vm_area_struct * vma;
38613 +       unsigned long address;
38614 +       const struct exception_table_entry *fixup;
38615 +       int write;
38616 +       unsigned long flags;
38617 +       siginfo_t info;
38618 +
38619 +       if (!user_mode(regs))
38620 +               error_code &= ~PF_USER; /* means kernel */
38621 +
38622 +       /* get the address */
38623 +       address = HYPERVISOR_shared_info->vcpu_info[
38624 +               smp_processor_id()].arch.cr2;
38625 +       if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
38626 +                                       SIGSEGV) == NOTIFY_STOP)
38627 +               return;
38628 +
38629 +       if (likely(regs->eflags & X86_EFLAGS_IF))
38630 +               local_irq_enable();
38631 +
38632 +       if (unlikely(page_fault_trace))
38633 +               printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
38634 +                      regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
38635 +
38636 +       tsk = current;
38637 +       mm = tsk->mm;
38638 +       info.si_code = SEGV_MAPERR;
38639 +
38640 +
38641 +       /*
38642 +        * We fault-in kernel-space virtual memory on-demand. The
38643 +        * 'reference' page table is init_mm.pgd.
38644 +        *
38645 +        * NOTE! We MUST NOT take any locks for this case. We may
38646 +        * be in an interrupt or a critical region, and should
38647 +        * only copy the information from the master page table,
38648 +        * nothing more.
38649 +        *
38650 +        * This verifies that the fault happens in kernel space
38651 +        * (error_code & 4) == 0, and that the fault was not a
38652 +        * protection error (error_code & 9) == 0.
38653 +        */
38654 +       if (unlikely(address >= TASK_SIZE64)) {
38655 +               /*
38656 +                * Must check for the entire kernel range here: with writable
38657 +                * page tables the hypervisor may temporarily clear PMD
38658 +                * entries.
38659 +                */
38660 +               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
38661 +                   address >= PAGE_OFFSET) {
38662 +                       if (vmalloc_fault(address) < 0)
38663 +                               goto bad_area_nosemaphore;
38664 +                       return;
38665 +               }
38666 +               /*
38667 +                * Don't take the mm semaphore here. If we fixup a prefetch
38668 +                * fault we could otherwise deadlock.
38669 +                */
38670 +               goto bad_area_nosemaphore;
38671 +       }
38672 +
38673 +       if (unlikely(error_code & PF_RSVD))
38674 +               pgtable_bad(address, regs, error_code);
38675 +
38676 +       /*
38677 +        * If we're in an interrupt or have no user
38678 +        * context, we must not take the fault..
38679 +        */
38680 +       if (unlikely(in_atomic() || !mm))
38681 +               goto bad_area_nosemaphore;
38682 +
38683 + again:
38684 +       /* When running in the kernel we expect faults to occur only to
38685 +        * addresses in user space.  All other faults represent errors in the
38686 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
38687 +        * erroneous fault occurring in a code path which already holds mmap_sem
38688 +        * we will deadlock attempting to validate the fault against the
38689 +        * address space.  Luckily the kernel only validly references user
38690 +        * space from well-defined areas of code, which are listed in the
38691 +        * exceptions table.
38692 +        *
38693 +        * As the vast majority of faults will be valid we will only perform
38694 +        * the source reference check when there is a possibility of a deadlock.
38695 +        * Attempt to lock the address space; if we cannot, we then validate the
38696 +        * source.  If this is invalid we can skip the address space check,
38697 +        * thus avoiding the deadlock.
38698 +        */
38699 +       if (!down_read_trylock(&mm->mmap_sem)) {
38700 +               if ((error_code & PF_USER) == 0 &&
38701 +                   !search_exception_tables(regs->rip))
38702 +                       goto bad_area_nosemaphore;
38703 +               down_read(&mm->mmap_sem);
38704 +       }
38705 +
38706 +       vma = find_vma(mm, address);
38707 +       if (!vma)
38708 +               goto bad_area;
38709 +       if (likely(vma->vm_start <= address))
38710 +               goto good_area;
38711 +       if (!(vma->vm_flags & VM_GROWSDOWN))
38712 +               goto bad_area;
38713 +       if (error_code & PF_USER) {
38714 +               // XXX: align red zone size with ABI 
38715 +               if (address + 128 < regs->rsp)
38716 +                       goto bad_area;
38717 +       }
38718 +       if (expand_stack(vma, address))
38719 +               goto bad_area;
38720 +/*
38721 + * Ok, we have a good vm_area for this memory access, so
38722 + * we can handle it..
38723 + */
38724 +good_area:
38725 +       info.si_code = SEGV_ACCERR;
38726 +       write = 0;
38727 +       switch (error_code & (PF_PROT|PF_WRITE)) {
38728 +               default:        /* 3: write, present */
38729 +                       /* fall through */
38730 +               case PF_WRITE:          /* write, not present */
38731 +                       if (!(vma->vm_flags & VM_WRITE))
38732 +                               goto bad_area;
38733 +                       write++;
38734 +                       break;
38735 +               case PF_PROT:           /* read, present */
38736 +                       goto bad_area;
38737 +               case 0:                 /* read, not present */
38738 +                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
38739 +                               goto bad_area;
38740 +       }
38741 +
38742 +       /*
38743 +        * If for any reason at all we couldn't handle the fault,
38744 +        * make sure we exit gracefully rather than endlessly redo
38745 +        * the fault.
38746 +        */
38747 +       switch (handle_mm_fault(mm, vma, address, write)) {
38748 +       case VM_FAULT_MINOR:
38749 +               tsk->min_flt++;
38750 +               break;
38751 +       case VM_FAULT_MAJOR:
38752 +               tsk->maj_flt++;
38753 +               break;
38754 +       case VM_FAULT_SIGBUS:
38755 +               goto do_sigbus;
38756 +       default:
38757 +               goto out_of_memory;
38758 +       }
38759 +
38760 +       up_read(&mm->mmap_sem);
38761 +       return;
38762 +
38763 +/*
38764 + * Something tried to access memory that isn't in our memory map..
38765 + * Fix it, but check if it's kernel or user first..
38766 + */
38767 +bad_area:
38768 +       up_read(&mm->mmap_sem);
38769 +
38770 +bad_area_nosemaphore:
38771 +       /* User mode accesses just cause a SIGSEGV */
38772 +       if (error_code & PF_USER) {
38773 +               if (is_prefetch(regs, address, error_code))
38774 +                       return;
38775 +
38776 +               /* Work around K8 erratum #100 K8 in compat mode
38777 +                  occasionally jumps to illegal addresses >4GB.  We
38778 +                  catch this here in the page fault handler because
38779 +                  these addresses are not reachable. Just detect this
38780 +                  case and return.  Any code segment in LDT is
38781 +                  compatibility mode. */
38782 +               if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
38783 +                   (address >> 32))
38784 +                       return;
38785 +
38786 +               if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
38787 +                       printk(
38788 +                      "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
38789 +                                       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
38790 +                                       tsk->comm, tsk->pid, address, regs->rip,
38791 +                                       regs->rsp, error_code);
38792 +               }
38793 +       
38794 +               tsk->thread.cr2 = address;
38795 +               /* Kernel addresses are always protection faults */
38796 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
38797 +               tsk->thread.trap_no = 14;
38798 +               info.si_signo = SIGSEGV;
38799 +               info.si_errno = 0;
38800 +               /* info.si_code has been set above */
38801 +               info.si_addr = (void __user *)address;
38802 +               force_sig_info(SIGSEGV, &info, tsk);
38803 +               return;
38804 +       }
38805 +
38806 +no_context:
38807 +       
38808 +       /* Are we prepared to handle this kernel fault?  */
38809 +       fixup = search_exception_tables(regs->rip);
38810 +       if (fixup) {
38811 +               regs->rip = fixup->fixup;
38812 +               return;
38813 +       }
38814 +
38815 +       /* 
38816 +        * Hall of shame of CPU/BIOS bugs.
38817 +        */
38818 +
38819 +       if (is_prefetch(regs, address, error_code))
38820 +               return;
38821 +
38822 +       if (is_errata93(regs, address))
38823 +               return; 
38824 +
38825 +/*
38826 + * Oops. The kernel tried to access some bad page. We'll have to
38827 + * terminate things with extreme prejudice.
38828 + */
38829 +
38830 +       flags = oops_begin();
38831 +
38832 +       if (address < PAGE_SIZE)
38833 +               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
38834 +       else
38835 +               printk(KERN_ALERT "Unable to handle kernel paging request");
38836 +       printk(" at %016lx RIP: \n" KERN_ALERT,address);
38837 +       printk_address(regs->rip);
38838 +       printk("\n");
38839 +       dump_pagetable(address);
38840 +       tsk->thread.cr2 = address;
38841 +       tsk->thread.trap_no = 14;
38842 +       tsk->thread.error_code = error_code;
38843 +       __die("Oops", regs, error_code);
38844 +       /* Executive summary in case the body of the oops scrolled away */
38845 +       printk(KERN_EMERG "CR2: %016lx\n", address);
38846 +       oops_end(flags);
38847 +       do_exit(SIGKILL);
38848 +
38849 +/*
38850 + * We ran out of memory, or some other thing happened to us that made
38851 + * us unable to handle the page fault gracefully.
38852 + */
38853 +out_of_memory:
38854 +       up_read(&mm->mmap_sem);
38855 +       if (current->pid == 1) { 
38856 +               yield();
38857 +               goto again;
38858 +       }
38859 +       printk("VM: killing process %s\n", tsk->comm);
38860 +       if (error_code & PF_USER)
38861 +               do_exit(SIGKILL);
38862 +       goto no_context;
38863 +
38864 +do_sigbus:
38865 +       up_read(&mm->mmap_sem);
38866 +
38867 +       /* Kernel mode? Handle exceptions or die */
38868 +       if (!(error_code & PF_USER))
38869 +               goto no_context;
38870 +
38871 +       tsk->thread.cr2 = address;
38872 +       tsk->thread.error_code = error_code;
38873 +       tsk->thread.trap_no = 14;
38874 +       info.si_signo = SIGBUS;
38875 +       info.si_errno = 0;
38876 +       info.si_code = BUS_ADRERR;
38877 +       info.si_addr = (void __user *)address;
38878 +       force_sig_info(SIGBUS, &info, tsk);
38879 +       return;
38880 +}
38881 +
38882 +static int __init enable_pagefaulttrace(char *str)
38883 +{
38884 +       page_fault_trace = 1;
38885 +       return 0;
38886 +}
38887 +__setup("pagefaulttrace", enable_pagefaulttrace);
38888 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/mm/init-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/mm/init-xen.c
38889 --- ref-linux-2.6.16.9/arch/x86_64/mm/init-xen.c        1970-01-01 01:00:00.000000000 +0100
38890 +++ tmp-linux-2.6-xen.patch/arch/x86_64/mm/init-xen.c   2006-04-10 00:05:52.000000000 +0200
38891 @@ -0,0 +1,1101 @@
38892 +/*
38893 + *  linux/arch/x86_64/mm/init.c
38894 + *
38895 + *  Copyright (C) 1995  Linus Torvalds
38896 + *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
38897 + *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
38898 + *
38899 + *  Jun Nakajima <jun.nakajima@intel.com>
38900 + *     Modified for Xen.
38901 + */
38902 +
38903 +#include <linux/config.h>
38904 +#include <linux/signal.h>
38905 +#include <linux/sched.h>
38906 +#include <linux/kernel.h>
38907 +#include <linux/errno.h>
38908 +#include <linux/string.h>
38909 +#include <linux/types.h>
38910 +#include <linux/ptrace.h>
38911 +#include <linux/mman.h>
38912 +#include <linux/mm.h>
38913 +#include <linux/swap.h>
38914 +#include <linux/smp.h>
38915 +#include <linux/init.h>
38916 +#include <linux/pagemap.h>
38917 +#include <linux/bootmem.h>
38918 +#include <linux/proc_fs.h>
38919 +#include <linux/pci.h>
38920 +#include <linux/dma-mapping.h>
38921 +#include <linux/module.h>
38922 +#include <linux/memory_hotplug.h>
38923 +
38924 +#include <asm/processor.h>
38925 +#include <asm/system.h>
38926 +#include <asm/uaccess.h>
38927 +#include <asm/pgtable.h>
38928 +#include <asm/pgalloc.h>
38929 +#include <asm/dma.h>
38930 +#include <asm/fixmap.h>
38931 +#include <asm/e820.h>
38932 +#include <asm/apic.h>
38933 +#include <asm/tlb.h>
38934 +#include <asm/mmu_context.h>
38935 +#include <asm/proto.h>
38936 +#include <asm/smp.h>
38937 +#include <asm/sections.h>
38938 +#include <asm/dma-mapping.h>
38939 +#include <asm/swiotlb.h>
38940 +
38941 +#include <xen/features.h>
38942 +
38943 +#ifndef Dprintk
38944 +#define Dprintk(x...)
38945 +#endif
38946 +
38947 +struct dma_mapping_ops* dma_ops;
38948 +EXPORT_SYMBOL(dma_ops);
38949 +
38950 +extern unsigned long *contiguous_bitmap;
38951 +
38952 +static unsigned long dma_reserve __initdata;
38953 +
38954 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
38955 +extern unsigned long start_pfn;
38956 +
38957 +/*
38958 + * Use this until direct mapping is established, i.e. before __va() is 
38959 + * available in init_memory_mapping().
38960 + */
38961 +
38962 +#define addr_to_page(addr, page)                               \
38963 +       (addr) &= PHYSICAL_PAGE_MASK;                           \
38964 +       (page) = ((unsigned long *) ((unsigned long)            \
38965 +       (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
38966 +       __START_KERNEL_map)))
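+
+/*
+ * Note: addr_to_page is a multi-statement macro (no do/while wrapper) that
+ * modifies 'addr' in place: it masks off the low flag bits, converts the
+ * machine frame to a pseudo-physical frame with mfn_to_pfn(), and rebases
+ * the result into the kernel image mapping at __START_KERNEL_map.
+ */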
38967 +
38968 +static void early_make_page_readonly(void *va, unsigned int feature)
38969 +{
38970 +       unsigned long addr, _va = (unsigned long)va;
38971 +       pte_t pte, *ptep;
38972 +       unsigned long *page = (unsigned long *) init_level4_pgt;
38973 +
38974 +       if (xen_feature(feature))
38975 +               return;
38976 +
38977 +       addr = (unsigned long) page[pgd_index(_va)];
38978 +       addr_to_page(addr, page);
38979 +
38980 +       addr = page[pud_index(_va)];
38981 +       addr_to_page(addr, page);
38982 +
38983 +       addr = page[pmd_index(_va)];
38984 +       addr_to_page(addr, page);
38985 +
38986 +       ptep = (pte_t *) &page[pte_index(_va)];
38987 +
38988 +       pte.pte = ptep->pte & ~_PAGE_RW;
38989 +       if (HYPERVISOR_update_va_mapping(_va, pte, 0))
38990 +               BUG();
38991 +}
38992 +
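+/*
+ * Note: when the target VA lies in the vmalloc area the same frame is also
+ * reachable through the direct map, so the helpers below recurse once to
+ * keep the direct-map alias in the same RO/RW state as the vmalloc mapping.
+ */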
38993 +void make_page_readonly(void *va, unsigned int feature)
38994 +{
38995 +       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
38996 +       unsigned long addr = (unsigned long) va;
38997 +
38998 +       if (xen_feature(feature))
38999 +               return;
39000 +
39001 +       pgd = pgd_offset_k(addr);
39002 +       pud = pud_offset(pgd, addr);
39003 +       pmd = pmd_offset(pud, addr);
39004 +       ptep = pte_offset_kernel(pmd, addr);
39005 +
39006 +       pte.pte = ptep->pte & ~_PAGE_RW;
39007 +       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
39008 +               xen_l1_entry_update(ptep, pte); /* fallback */
39009 +
39010 +       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
39011 +               make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
39012 +}
39013 +
39014 +void make_page_writable(void *va, unsigned int feature)
39015 +{
39016 +       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
39017 +       unsigned long addr = (unsigned long) va;
39018 +
39019 +       if (xen_feature(feature))
39020 +               return;
39021 +
39022 +       pgd = pgd_offset_k(addr);
39023 +       pud = pud_offset(pgd, addr);
39024 +       pmd = pmd_offset(pud, addr);
39025 +       ptep = pte_offset_kernel(pmd, addr);
39026 +
39027 +       pte.pte = ptep->pte | _PAGE_RW;
39028 +       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
39029 +               xen_l1_entry_update(ptep, pte); /* fallback */
39030 +
39031 +       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
39032 +               make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
39033 +}
39034 +
39035 +void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
39036 +{
39037 +       if (xen_feature(feature))
39038 +               return;
39039 +
39040 +       while (nr-- != 0) {
39041 +               make_page_readonly(va, feature);
39042 +               va = (void*)((unsigned long)va + PAGE_SIZE);
39043 +       }
39044 +}
39045 +
39046 +void make_pages_writable(void *va, unsigned nr, unsigned int feature)
39047 +{
39048 +       if (xen_feature(feature))
39049 +               return;
39050 +
39051 +       while (nr-- != 0) {
39052 +               make_page_writable(va, feature);
39053 +               va = (void*)((unsigned long)va + PAGE_SIZE);
39054 +       }
39055 +}
39056 +
39057 +/*
39058 + * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
39059 + * in physical space, so we can cache the place of the first one and move
39060 + * around without checking the pgd every time.
39061 + */
39062 +
39063 +void show_mem(void)
39064 +{
39065 +       long i, total = 0, reserved = 0;
39066 +       long shared = 0, cached = 0;
39067 +       pg_data_t *pgdat;
39068 +       struct page *page;
39069 +
39070 +       printk(KERN_INFO "Mem-info:\n");
39071 +       show_free_areas();
39072 +       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
39073 +
39074 +       for_each_pgdat(pgdat) {
39075 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
39076 +                       page = pfn_to_page(pgdat->node_start_pfn + i);
39077 +                       total++;
39078 +                       if (PageReserved(page))
39079 +                               reserved++;
39080 +                       else if (PageSwapCache(page))
39081 +                               cached++;
39082 +                       else if (page_count(page))
39083 +                               shared += page_count(page) - 1;
39084 +               }
39085 +       }
39086 +       printk(KERN_INFO "%lu pages of RAM\n", total);
39087 +       printk(KERN_INFO "%lu reserved pages\n", reserved);
39088 +       printk(KERN_INFO "%lu pages shared\n", shared);
39089 +       printk(KERN_INFO "%lu pages swap cached\n", cached);
39090 +}
39091 +
39092 +/* References to section boundaries */
39093 +
39094 +int after_bootmem;
39095 +
39096 +static void *spp_getpage(void)
39097 +{ 
39098 +       void *ptr;
39099 +       if (after_bootmem)
39100 +               ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
39101 +       else
39102 +               ptr = alloc_bootmem_pages(PAGE_SIZE);
39103 +       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
39104 +               panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
39105 +
39106 +       Dprintk("spp_getpage %p\n", ptr);
39107 +       return ptr;
39108 +} 
39109 +
39110 +#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
39111 +
39112 +static inline pud_t *pud_offset_u(unsigned long address)
39113 +{
39114 +       pud_t *pud = level3_user_pgt;
39115 +
39116 +       return pud + pud_index(address);
39117 +}
39118 +
39119 +static void set_pte_phys(unsigned long vaddr,
39120 +                        unsigned long phys, pgprot_t prot, int user_mode)
39121 +{
39122 +       pgd_t *pgd;
39123 +       pud_t *pud;
39124 +       pmd_t *pmd;
39125 +       pte_t *pte, new_pte;
39126 +
39127 +       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
39128 +
39129 +       pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
39130 +       if (pgd_none(*pgd)) {
39131 +               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
39132 +               return;
39133 +       }
39134 +       pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
39135 +       if (pud_none(*pud)) {
39136 +               pmd = (pmd_t *) spp_getpage(); 
39137 +               make_page_readonly(pmd, XENFEAT_writable_page_tables);
39138 +               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
39139 +               if (pmd != pmd_offset(pud, 0)) {
39140 +                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
39141 +                       return;
39142 +               }
39143 +       }
39144 +       pmd = pmd_offset(pud, vaddr);
39145 +       if (pmd_none(*pmd)) {
39146 +               pte = (pte_t *) spp_getpage();
39147 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
39148 +               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
39149 +               if (pte != pte_offset_kernel(pmd, 0)) {
39150 +                       printk("PAGETABLE BUG #02!\n");
39151 +                       return;
39152 +               }
39153 +       }
39154 +       new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
39155 +
39156 +       pte = pte_offset_kernel(pmd, vaddr);
39157 +       if (!pte_none(*pte) &&
39158 +           pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
39159 +               pte_ERROR(*pte);
39160 +       set_pte(pte, new_pte);
39161 +
39162 +       /*
39163 +        * It's enough to flush this one mapping.
39164 +        * (PGE mappings get flushed as well)
39165 +        */
39166 +       __flush_tlb_one(vaddr);
39167 +}
39168 +
39169 +static void set_pte_phys_ma(unsigned long vaddr,
39170 +                        unsigned long phys, pgprot_t prot)
39171 +{
39172 +       pgd_t *pgd;
39173 +       pud_t *pud;
39174 +       pmd_t *pmd;
39175 +       pte_t *pte, new_pte;
39176 +
39177 +       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
39178 +
39179 +       pgd = pgd_offset_k(vaddr);
39180 +       if (pgd_none(*pgd)) {
39181 +               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
39182 +               return;
39183 +       }
39184 +       pud = pud_offset(pgd, vaddr);
39185 +       if (pud_none(*pud)) {
39186 +
39187 +               pmd = (pmd_t *) spp_getpage(); 
39188 +               make_page_readonly(pmd, XENFEAT_writable_page_tables);
39189 +
39190 +               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
39191 +
39192 +               if (pmd != pmd_offset(pud, 0)) {
39193 +                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
39194 +                       return;
39195 +               }
39196 +       }
39197 +       pmd = pmd_offset(pud, vaddr);
39198 +
39199 +       if (pmd_none(*pmd)) {
39200 +               pte = (pte_t *) spp_getpage();
39201 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
39202 +
39203 +               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
39204 +               if (pte != pte_offset_kernel(pmd, 0)) {
39205 +                       printk("PAGETABLE BUG #02!\n");
39206 +                       return;
39207 +               }
39208 +       }
39209 +
39210 +       new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
39211 +       pte = pte_offset_kernel(pmd, vaddr);
39212 +
39213 +       /* 
39214 +        * Note that the pte page is already RO, thus we want to use
39215 +        * xen_l1_entry_update(), not set_pte().
39216 +        */
39217 +       xen_l1_entry_update(pte, 
39218 +                           pfn_pte_ma(phys >> PAGE_SHIFT, prot));
39219 +
39220 +       /*
39221 +        * It's enough to flush this one mapping.
39222 +        * (PGE mappings get flushed as well)
39223 +        */
39224 +       __flush_tlb_one(vaddr);
39225 +}
39226 +
39227 +#define SET_FIXMAP_KERNEL 0
39228 +#define SET_FIXMAP_USER   1
39229 +
39230 +/* NOTE: this is meant to be run only at boot */
39231 +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
39232 +{
39233 +       unsigned long address = __fix_to_virt(idx);
39234 +
39235 +       if (idx >= __end_of_fixed_addresses) {
39236 +               printk("Invalid __set_fixmap\n");
39237 +               return;
39238 +       }
39239 +       switch (idx) {
39240 +       case VSYSCALL_FIRST_PAGE:
39241 +               set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
39242 +               break;
39243 +       default:
39244 +               set_pte_phys_ma(address, phys, prot);
39245 +               break;
39246 +       }
39247 +}
39248 +
39249 +/*
39250 + * At this point it only supports vsyscall area.
39251 + */
39252 +void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
39253 +{
39254 +       unsigned long address = __fix_to_virt(idx);
39255 +
39256 +       if (idx >= __end_of_fixed_addresses) {
39257 +               printk("Invalid __set_fixmap\n");
39258 +               return;
39259 +       }
39260 +
39261 +       set_pte_phys(address, phys, prot, SET_FIXMAP_USER); 
39262 +}
39263 +
39264 +unsigned long __initdata table_start, tables_space; 
39265 +
39266 +unsigned long get_machine_pfn(unsigned long addr)
39267 +{
39268 +       pud_t* pud = pud_offset_k(NULL, addr);
39269 +       pmd_t* pmd = pmd_offset(pud, addr);
39270 +       pte_t *pte = pte_offset_kernel(pmd, addr);
39271 +
39272 +       return pte_mfn(*pte);
39273 +} 
39274 +
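+/*
+ * Early pagetable allocator: after bootmem is up, hand out a zeroed page;
+ * before that, consume the next frame past the kernel image (start_pfn),
+ * which the extended init mapping keeps mapped and which we zero by hand.
+ */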
39275 +static __meminit void *alloc_static_page(unsigned long *phys)
39276 +{
39277 +       unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
39278 +
39279 +       if (after_bootmem) {
39280 +               void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
39281 +
39282 +               *phys = __pa(adr);
39283 +               return adr;
39284 +       }
39285 +
39286 +       *phys = start_pfn << PAGE_SHIFT;
39287 +       start_pfn++;
39288 +       memset((void *)va, 0, PAGE_SIZE);
39289 +       return (void *)va;
39290 +} 
39291 +
39292 +#define PTE_SIZE PAGE_SIZE
39293 +
39294 +static inline void __set_pte(pte_t *dst, pte_t val)
39295 +{
39296 +       *dst = val;
39297 +}
39298 +
39299 +static inline int make_readonly(unsigned long paddr)
39300 +{
39301 +       int readonly = 0;
39302 +
39303 +       /* Make old and new page tables read-only. */
39304 +       if (!xen_feature(XENFEAT_writable_page_tables)
39305 +           && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
39306 +           && (paddr < ((table_start << PAGE_SHIFT) + tables_space)))
39307 +               readonly = 1;
39308 +       /*
39309 +        * No need for writable mapping of kernel image. This also ensures that
39310 +        * page and descriptor tables embedded inside don't have writable
39311 +        * mappings. 
39312 +        */
39313 +       if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
39314 +               readonly = 1;
39315 +
39316 +       return readonly;
39317 +}
39318 +
39319 +static void __meminit
39320 +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
39321 +{
39322 +       int i, k;
39323 +
39324 +       for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
39325 +               unsigned long pte_phys;
39326 +               pte_t *pte, *pte_save;
39327 +
39328 +               if (address >= end) {
39329 +                       for (; i < PTRS_PER_PMD; i++, pmd++)
39330 +                               set_pmd(pmd, __pmd(0));
39331 +                       break;
39332 +               }
39333 +               pte = alloc_static_page(&pte_phys);
39334 +               pte_save = pte;
39335 +               for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
39336 +                       if ((address >= end) ||
39337 +                           ((address >> PAGE_SHIFT) >=
39338 +                            xen_start_info->nr_pages)) { 
39339 +                               __set_pte(pte, __pte(0)); 
39340 +                               continue;
39341 +                       }
39342 +                       if (make_readonly(address)) {
39343 +                               __set_pte(pte, 
39344 +                                         __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
39345 +                               continue;
39346 +                       }
39347 +                       __set_pte(pte, __pte(address | _KERNPG_TABLE));
39348 +               }
39349 +               pte = pte_save;
39350 +               early_make_page_readonly(pte, XENFEAT_writable_page_tables);
39351 +               set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
39352 +       }
39353 +}
39354 +
39355 +static void __meminit
39356 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
39357 +{
39358 +       pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
39359 +
39360 +       if (pmd_none(*pmd)) {
39361 +               spin_lock(&init_mm.page_table_lock);
39362 +               phys_pmd_init(pmd, address, end);
39363 +               spin_unlock(&init_mm.page_table_lock);
39364 +               __flush_tlb_all();
39365 +       }
39366 +}
39367 +
39368 +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
39369 +{ 
39370 +       long i = pud_index(address);
39371 +
39372 +       pud = pud + i;
39373 +
39374 +       if (after_bootmem && pud_val(*pud)) {
39375 +               phys_pmd_update(pud, address, end);
39376 +               return;
39377 +       }
39378 +
39379 +       for (; i < PTRS_PER_PUD; pud++, i++) {
39380 +               unsigned long paddr, pmd_phys;
39381 +               pmd_t *pmd;
39382 +
39383 +               paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
39384 +               if (paddr >= end)
39385 +                       break;
39386 +
39387 +               pmd = alloc_static_page(&pmd_phys);
39388 +               early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
39389 +               spin_lock(&init_mm.page_table_lock);
39390 +               set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
39391 +               phys_pmd_init(pmd, paddr, end);
39392 +               spin_unlock(&init_mm.page_table_lock);
39393 +       }
39394 +       __flush_tlb();
39395 +} 
39396 +
39397 +void __init xen_init_pt(void)
39398 +{
39399 +       unsigned long addr, *page;
39400 +
39401 +       memset((void *)init_level4_pgt,   0, PAGE_SIZE);
39402 +       memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
39403 +       memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
39404 +
39405 +       /* Find the initial pte page that was built for us. */
39406 +       page = (unsigned long *)xen_start_info->pt_base;
39407 +       addr = page[pgd_index(__START_KERNEL_map)];
39408 +       addr_to_page(addr, page);
39409 +       addr = page[pud_index(__START_KERNEL_map)];
39410 +       addr_to_page(addr, page);
39411 +
39412 +       /* Construct mapping of initial pte page in our own directories. */
39413 +       init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
39414 +               mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
39415 +       level3_kernel_pgt[pud_index(__START_KERNEL_map)] = 
39416 +               __pud(__pa_symbol(level2_kernel_pgt) |
39417 +                     _KERNPG_TABLE | _PAGE_USER);
39418 +       memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
39419 +
39420 +       early_make_page_readonly(init_level4_pgt,
39421 +                                XENFEAT_writable_page_tables);
39422 +       early_make_page_readonly(init_level4_user_pgt,
39423 +                                XENFEAT_writable_page_tables);
39424 +       early_make_page_readonly(level3_kernel_pgt,
39425 +                                XENFEAT_writable_page_tables);
39426 +       early_make_page_readonly(level3_user_pgt,
39427 +                                XENFEAT_writable_page_tables);
39428 +       early_make_page_readonly(level2_kernel_pgt,
39429 +                                XENFEAT_writable_page_tables);
39430 +
39431 +       xen_pgd_pin(__pa_symbol(init_level4_pgt));
39432 +       xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
39433 +
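+       /*
+        * Slot 511 is the topmost pgd entry, covering the kernel/vsyscall
+        * range; pointing the user pagetable's copy at level3_user_pgt lets
+        * __set_fixmap_user() expose the vsyscall page to user space.
+        */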
39434 +       set_pgd((pgd_t *)(init_level4_user_pgt + 511), 
39435 +               mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
39436 +}
39437 +
39438 +void __init extend_init_mapping(void) 
39439 +{
39440 +       unsigned long va = __START_KERNEL_map;
39441 +       unsigned long phys, addr, *pte_page;
39442 +       pmd_t *pmd;
39443 +       pte_t *pte, new_pte;
39444 +       unsigned long *page = (unsigned long *)init_level4_pgt;
39445 +
39446 +       addr = page[pgd_index(va)];
39447 +       addr_to_page(addr, page);
39448 +       addr = page[pud_index(va)];
39449 +       addr_to_page(addr, page);
39450 +
39451 +       /* Kill mapping of low 1MB. */
39452 +       while (va < (unsigned long)&_text) {
39453 +               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
39454 +               va += PAGE_SIZE;
39455 +       }
39456 +
39457 +       /* Ensure init mappings cover kernel text/data and initial tables. */
39458 +       while (va < (__START_KERNEL_map
39459 +                    + (start_pfn << PAGE_SHIFT)
39460 +                    + tables_space)) {
39461 +               pmd = (pmd_t *)&page[pmd_index(va)];
39462 +               if (pmd_none(*pmd)) {
39463 +                       pte_page = alloc_static_page(&phys);
39464 +                       early_make_page_readonly(
39465 +                               pte_page, XENFEAT_writable_page_tables);
39466 +                       set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER));
39467 +               } else {
39468 +                       addr = page[pmd_index(va)];
39469 +                       addr_to_page(addr, pte_page);
39470 +               }
39471 +               pte = (pte_t *)&pte_page[pte_index(va)];
39472 +               if (pte_none(*pte)) {
39473 +                       new_pte = pfn_pte(
39474 +                               (va - __START_KERNEL_map) >> PAGE_SHIFT, 
39475 +                               __pgprot(_KERNPG_TABLE | _PAGE_USER));
39476 +                       xen_l1_entry_update(pte, new_pte);
39477 +               }
39478 +               va += PAGE_SIZE;
39479 +       }
39480 +
39481 +       /* Finally, blow away any spurious initial mappings. */
39482 +       while (1) {
39483 +               pmd = (pmd_t *)&page[pmd_index(va)];
39484 +               if (pmd_none(*pmd))
39485 +                       break;
39486 +               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
39487 +               va += PAGE_SIZE;
39488 +       }
39489 +}
39490 +
39491 +static void __init find_early_table_space(unsigned long end)
39492 +{
39493 +       unsigned long puds, pmds, ptes; 
39494 +
39495 +       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
39496 +       pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
39497 +       ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
39498 +
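+       /* Each pagetable entry is 8 bytes on x86-64; round each level up
+          to whole pages when sizing the direct-mapping tables. */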
39499 +       tables_space =
39500 +               round_up(puds * 8, PAGE_SIZE) + 
39501 +               round_up(pmds * 8, PAGE_SIZE) + 
39502 +               round_up(ptes * 8, PAGE_SIZE); 
39503 +
39504 +       extend_init_mapping();
39505 +
39506 +       table_start = start_pfn;
39507 +
39508 +       early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
39509 +               end, table_start << PAGE_SHIFT, start_pfn << PAGE_SHIFT);
39510 +}
39511 +
39512 +/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
39513 +   This runs before bootmem is initialized and gets pages directly from the 
39514 +   physical memory. To access them they are temporarily mapped. */
39515 +void __meminit init_memory_mapping(unsigned long start, unsigned long end)
39516 +{ 
39517 +       unsigned long next; 
39518 +
39519 +       Dprintk("init_memory_mapping\n");
39520 +
39521 +       /* 
39522 +        * Find space for the kernel direct mapping tables.
39523 +        * Later we should allocate these tables in the local node of the memory
39524 +        * mapped.  Unfortunately this is done currently before the nodes are 
39525 +        * discovered.
39526 +        */
39527 +       if (!after_bootmem)
39528 +               find_early_table_space(end);
39529 +
39530 +       start = (unsigned long)__va(start);
39531 +       end = (unsigned long)__va(end);
39532 +
39533 +       for (; start < end; start = next) {
39534 +               unsigned long pud_phys; 
39535 +               pgd_t *pgd = pgd_offset_k(start);
39536 +               pud_t *pud;
39537 +
39538 +               if (after_bootmem) {
39539 +                       pud = pud_offset_k(pgd, __PAGE_OFFSET);
39540 +                       make_page_readonly(pud, XENFEAT_writable_page_tables);
39541 +                       pud_phys = __pa(pud);
39542 +               } else {
39543 +                       pud = alloc_static_page(&pud_phys);
39544 +                       early_make_page_readonly(pud, XENFEAT_writable_page_tables);
39545 +               }
39546 +               next = start + PGDIR_SIZE;
39547 +               if (next > end) 
39548 +                       next = end; 
39549 +               phys_pud_init(pud, __pa(start), __pa(next));
39550 +               if (!after_bootmem)
39551 +                       set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
39552 +       }
39553 +
39554 +       BUG_ON(!after_bootmem && start_pfn != table_start + (tables_space >> PAGE_SHIFT));
39555 +
39556 +       __flush_tlb_all();
39557 +}
39558 +
39559 +void __cpuinit zap_low_mappings(int cpu)
39560 +{
39561 +       /* this is not required for Xen */
39562 +#if 0
39563 +       swap_low_mappings();
39564 +#endif
39565 +}
39566 +
39567 +/* Compute zone sizes for the DMA and DMA32 zones in a node. */
39568 +__init void
39569 +size_zones(unsigned long *z, unsigned long *h,
39570 +          unsigned long start_pfn, unsigned long end_pfn)
39571 +{
39572 +       int i;
39573 +#ifndef CONFIG_XEN
39574 +       unsigned long w;
39575 +#endif
39576 +
39577 +       for (i = 0; i < MAX_NR_ZONES; i++)
39578 +               z[i] = 0;
39579 +
39580 +#ifndef CONFIG_XEN
39581 +       if (start_pfn < MAX_DMA_PFN)
39582 +               z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
39583 +       if (start_pfn < MAX_DMA32_PFN) {
39584 +               unsigned long dma32_pfn = MAX_DMA32_PFN;
39585 +               if (dma32_pfn > end_pfn)
39586 +                       dma32_pfn = end_pfn;
39587 +               z[ZONE_DMA32] = dma32_pfn - start_pfn;
39588 +       }
39589 +       z[ZONE_NORMAL] = end_pfn - start_pfn;
39590 +
39591 +       /* Remove lower zones from higher ones. */
39592 +       w = 0;
39593 +       for (i = 0; i < MAX_NR_ZONES; i++) {
39594 +               if (z[i])
39595 +                       z[i] -= w;
39596 +               w += z[i];
39597 +       }
39598 +
39599 +       /* Compute holes */
39600 +       w = start_pfn;
39601 +       for (i = 0; i < MAX_NR_ZONES; i++) {
39602 +               unsigned long s = w;
39603 +               w += z[i];
39604 +               h[i] = e820_hole_size(s, w);
39605 +       }
39606 +
39607 +       /* Add the space needed for mem_map to the holes too. */
39608 +       for (i = 0; i < MAX_NR_ZONES; i++)
39609 +               h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
39610 +
39611 +       /* The 16MB DMA zone has the kernel and other misc mappings.
39612 +          Account them too */
39613 +       if (h[ZONE_DMA]) {
39614 +               h[ZONE_DMA] += dma_reserve;
39615 +               if (h[ZONE_DMA] >= z[ZONE_DMA]) {
39616 +                       printk(KERN_WARNING
39617 +                               "Kernel too large and filling up ZONE_DMA?\n");
39618 +                       h[ZONE_DMA] = z[ZONE_DMA];
39619 +               }
39620 +       }
39621 +#else
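+       /*
+        * Under Xen the guest's pseudo-physical frames map to arbitrary
+        * machine addresses, so DMA zone boundaries carry no meaning here:
+        * put all memory in ZONE_DMA and report no holes.
+        */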
39622 +       z[ZONE_DMA] = end_pfn;
39623 +       for (i = 0; i < MAX_NR_ZONES; i++)
39624 +               h[i] = 0;
39625 +#endif
39626 +}
39627 +
39628 +#ifndef CONFIG_NUMA
39629 +void __init paging_init(void)
39630 +{
39631 +       unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
39632 +       int i;
39633 +
39634 +       memory_present(0, 0, end_pfn);
39635 +       sparse_init();
39636 +       size_zones(zones, holes, 0, end_pfn);
39637 +       free_area_init_node(0, NODE_DATA(0), zones,
39638 +                           __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
39639 +
39640 +       if (!xen_feature(XENFEAT_auto_translated_physmap) ||
39641 +           xen_start_info->shared_info >= xen_start_info->nr_pages) {
39642 +               /* Switch to the real shared_info page, and clear the
39643 +                * dummy page. */
39644 +               set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
39645 +               HYPERVISOR_shared_info =
39646 +                       (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
39647 +               memset(empty_zero_page, 0, sizeof(empty_zero_page));
39648 +       }
39649 +
39650 +       init_mm.context.pinned = 1;
39651 +
39652 +       /* Setup mapping of lower 1st MB */
39653 +       for (i = 0; i < NR_FIX_ISAMAPS; i++)
39654 +               if (xen_start_info->flags & SIF_PRIVILEGED)
39655 +                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
39656 +               else
39657 +                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
39658 +                                    virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
39659 +                                    PAGE_KERNEL_RO);
39660 +}
39661 +#endif
39662 +
39663 +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
39664 +   from the CPU leading to inconsistent cache lines. address and size
39665 +   must be aligned to 2MB boundaries. 
39666 +   Does nothing when the mapping doesn't exist. */
39667 +void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
39668 +{
39669 +       unsigned long end = address + size;
39670 +
39671 +       BUG_ON(address & ~LARGE_PAGE_MASK);
39672 +       BUG_ON(size & ~LARGE_PAGE_MASK); 
39673 +       
39674 +       for (; address < end; address += LARGE_PAGE_SIZE) { 
39675 +               pgd_t *pgd = pgd_offset_k(address);
39676 +               pud_t *pud;
39677 +               pmd_t *pmd;
39678 +               if (pgd_none(*pgd))
39679 +                       continue;
39680 +               pud = pud_offset(pgd, address);
39681 +               if (pud_none(*pud))
39682 +                       continue; 
39683 +               pmd = pmd_offset(pud, address);
39684 +               if (!pmd || pmd_none(*pmd))
39685 +                       continue; 
39686 +               if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
39687 +                       /* Could handle this, but it should not happen currently. */
39688 +                       printk(KERN_ERR 
39689 +              "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
39690 +                       pmd_ERROR(*pmd); 
39691 +               }
39692 +               set_pmd(pmd, __pmd(0));                 
39693 +       }
39694 +       __flush_tlb_all();
39695 +} 
39696 +
39697 +/*
39698 + * Memory hotplug specific functions
39699 + * These are only for non-NUMA machines right now.
39700 + */
39701 +#ifdef CONFIG_MEMORY_HOTPLUG
39702 +
39703 +void online_page(struct page *page)
39704 +{
39705 +       ClearPageReserved(page);
39706 +       set_page_count(page, 1);
39707 +       __free_page(page);
39708 +       totalram_pages++;
39709 +       num_physpages++;
39710 +}
39711 +
39712 +int add_memory(u64 start, u64 size)
39713 +{
39714 +       struct pglist_data *pgdat = NODE_DATA(0);
39715 +       struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
39716 +       unsigned long start_pfn = start >> PAGE_SHIFT;
39717 +       unsigned long nr_pages = size >> PAGE_SHIFT;
39718 +       int ret;
39719 +
39720 +       ret = __add_pages(zone, start_pfn, nr_pages);
39721 +       if (ret)
39722 +               goto error;
39723 +
39724 +       init_memory_mapping(start, (start + size -1));
39725 +
39726 +       return ret;
39727 +error:
39728 +       printk("%s: Problem encountered in __add_pages!\n", __func__);
39729 +       return ret;
39730 +}
39731 +EXPORT_SYMBOL_GPL(add_memory);
39732 +
39733 +int remove_memory(u64 start, u64 size)
39734 +{
39735 +       return -EINVAL;
39736 +}
39737 +EXPORT_SYMBOL_GPL(remove_memory);
39738 +
39739 +#endif
39740 +
39741 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
39742 +                        kcore_vsyscall;
39743 +
39744 +void __init mem_init(void)
39745 +{
39746 +       long codesize, reservedpages, datasize, initsize;
39747 +
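+       /*
+        * Note: contiguous_bitmap records which pseudo-physical page ranges
+        * are machine-contiguous (maintained by the Xen hypervisor helpers);
+        * it is sized with a couple of longs of slack past end_pfn.
+        */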
39748 +       contiguous_bitmap = alloc_bootmem_low_pages(
39749 +               (end_pfn + 2*BITS_PER_LONG) >> 3);
39750 +       BUG_ON(!contiguous_bitmap);
39751 +       memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
39752 +
39753 +#if defined(CONFIG_SWIOTLB)
39754 +       pci_swiotlb_init();     
39755 +#endif
39756 +       no_iommu_init();
39757 +
39758 +       /* How many end-of-memory variables you have, grandma! */
39759 +       max_low_pfn = end_pfn;
39760 +       max_pfn = end_pfn;
39761 +       num_physpages = end_pfn;
39762 +       high_memory = (void *) __va(end_pfn * PAGE_SIZE);
39763 +
39764 +       /* clear the zero-page */
39765 +       memset(empty_zero_page, 0, PAGE_SIZE);
39766 +
39767 +       reservedpages = 0;
39768 +
39769 +       /* this will put all low memory onto the freelists */
39770 +#ifdef CONFIG_NUMA
39771 +       totalram_pages = numa_free_all_bootmem();
39772 +#else
39773 +       totalram_pages = free_all_bootmem();
39774 +#endif
39775 +       reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
39776 +
39777 +       after_bootmem = 1;
39778 +
39779 +       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
39780 +       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
39781 +       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
39782 +
39783 +       /* Register memory areas for /proc/kcore */
39784 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
39785 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
39786 +                  VMALLOC_END-VMALLOC_START);
39787 +       kclist_add(&kcore_kernel, &_stext, _end - _stext);
39788 +       kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
39789 +       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
39790 +                                VSYSCALL_END - VSYSCALL_START);
39791 +
39792 +       printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
39793 +               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
39794 +               end_pfn << (PAGE_SHIFT-10),
39795 +               codesize >> 10,
39796 +               reservedpages << (PAGE_SHIFT-10),
39797 +               datasize >> 10,
39798 +               initsize >> 10);
39799 +
39800 +#ifndef CONFIG_XEN
39801 +#ifdef CONFIG_SMP
39802 +       /*
39803 +        * Sync boot_level4_pgt mappings with the init_level4_pgt
39804 +        * except for the low identity mappings which are already zapped
39805 +        * in init_level4_pgt. This sync-up is essential for AP's bringup
39806 +        */
39807 +       memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
39808 +#endif
39809 +#endif
39810 +}
39811 +
39812 +void free_initmem(void)
39813 +{
39814 +#ifdef __DO_LATER__
39815 +       /*
39816 +        * Some pages can be pinned, but some are not. Unpinning such pages 
39817 +        * triggers BUG(). 
39818 +        */
39819 +       unsigned long addr;
39820 +
39821 +       addr = (unsigned long)(&__init_begin);
39822 +       for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
39823 +               ClearPageReserved(virt_to_page(addr));
39824 +               set_page_count(virt_to_page(addr), 1);
39825 +               memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
39826 +               make_page_writable(
39827 +                       __va(__pa(addr)), XENFEAT_writable_page_tables);
39828 +               /*
39829 +                * Make pages from __PAGE_OFFSET address as well
39830 +                */
39831 +               make_page_writable(
39832 +                       (void *)addr, XENFEAT_writable_page_tables);
39833 +               free_page(addr);
39834 +               totalram_pages++;
39835 +       }
39836 +       memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
39837 +       printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
39838 +#endif
39839 +}
39840 +
39841 +#ifdef CONFIG_DEBUG_RODATA
39842 +
39843 +extern char __start_rodata, __end_rodata;
39844 +void mark_rodata_ro(void)
39845 +{
39846 +       unsigned long addr = (unsigned long)&__start_rodata;
39847 +
39848 +       for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
39849 +               change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
39850 +
39851 +       printk ("Write protecting the kernel read-only data: %luk\n",
39852 +                       (&__end_rodata - &__start_rodata) >> 10);
39853 +
39854 +       /*
39855 +        * change_page_attr_addr() requires a global_flush_tlb() call after it.
39856 +        * We do this after the printk so that if something went wrong in the
39857 +        * change, the printk gets out at least to give a better debug hint
39858 +        * of who is the culprit.
39859 +        */
39860 +       global_flush_tlb();
39861 +}
39862 +#endif
39863 +
39864 +#ifdef CONFIG_BLK_DEV_INITRD
39865 +void free_initrd_mem(unsigned long start, unsigned long end)
39866 +{
39867 +       if (start >= end)
39868 +               return;
39869 +       printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
39870 +       for (; start < end; start += PAGE_SIZE) {
39871 +               ClearPageReserved(virt_to_page(start));
39872 +               set_page_count(virt_to_page(start), 1);
39873 +               free_page(start);
39874 +               totalram_pages++;
39875 +       }
39876 +}
39877 +#endif
39878 +
39879 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
39880 +{ 
39881 +       /* Should check here against the e820 map to avoid double free */ 
39882 +#ifdef CONFIG_NUMA
39883 +       int nid = phys_to_nid(phys);
39884 +       reserve_bootmem_node(NODE_DATA(nid), phys, len);
39885 +#else                  
39886 +       reserve_bootmem(phys, len);    
39887 +#endif
39888 +       if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
39889 +               dma_reserve += len / PAGE_SIZE;
39890 +}
39891 +
39892 +int kern_addr_valid(unsigned long addr) 
39893 +{ 
39894 +       unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
39895 +       pgd_t *pgd;
39896 +       pud_t *pud;
39897 +       pmd_t *pmd;
39898 +       pte_t *pte;
39899 +
39900 +       if (above != 0 && above != -1UL)
39901 +               return 0; 
39902 +       
39903 +       pgd = pgd_offset_k(addr);
39904 +       if (pgd_none(*pgd))
39905 +               return 0;
39906 +
39907 +       pud = pud_offset_k(pgd, addr);
39908 +       if (pud_none(*pud))
39909 +               return 0; 
39910 +
39911 +       pmd = pmd_offset(pud, addr);
39912 +       if (pmd_none(*pmd))
39913 +               return 0;
39914 +       if (pmd_large(*pmd))
39915 +               return pfn_valid(pmd_pfn(*pmd));
39916 +
39917 +       pte = pte_offset_kernel(pmd, addr);
39918 +       if (pte_none(*pte))
39919 +               return 0;
39920 +       return pfn_valid(pte_pfn(*pte));
39921 +}
39922 +
39923 +#ifdef CONFIG_SYSCTL
39924 +#include <linux/sysctl.h>
39925 +
39926 +extern int exception_trace, page_fault_trace;
39927 +
39928 +static ctl_table debug_table2[] = {
39929 +       { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
39930 +         proc_dointvec },
39931 +       { 0, }
39932 +}; 
39933 +
39934 +static ctl_table debug_root_table2[] = { 
39935 +       { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
39936 +          .child = debug_table2 }, 
39937 +       { 0 }, 
39938 +}; 
39939 +
39940 +static __init int x8664_sysctl_init(void)
39941 +{ 
39942 +       register_sysctl_table(debug_root_table2, 1);
39943 +       return 0;
39944 +}
39945 +__initcall(x8664_sysctl_init);
39946 +#endif
39947 +
39948 +/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
39949 +   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
39950 +   not need special handling anymore. */
39951 +
39952 +static struct vm_area_struct gate_vma = {
39953 +       .vm_start = VSYSCALL_START,
39954 +       .vm_end = VSYSCALL_END,
39955 +       .vm_page_prot = PAGE_READONLY
39956 +};
39957 +
39958 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
39959 +{
39960 +#ifdef CONFIG_IA32_EMULATION
39961 +       if (test_tsk_thread_flag(tsk, TIF_IA32))
39962 +               return NULL;
39963 +#endif
39964 +       return &gate_vma;
39965 +}
39966 +
39967 +int in_gate_area(struct task_struct *task, unsigned long addr)
39968 +{
39969 +       struct vm_area_struct *vma = get_gate_vma(task);
39970 +       if (!vma)
39971 +               return 0;
39972 +       return (addr >= vma->vm_start) && (addr < vma->vm_end);
39973 +}
39974 +
39975 +/* Use this when you have no reliable task/vma, typically from interrupt
39976 + * context.  It is less reliable than using the task's vma and may give
39977 + * false positives.
39978 + */
39979 +int in_gate_area_no_task(unsigned long addr)
39980 +{
39981 +       return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
39982 +}
39983 +
39984 +/*
39985 + * Local variables:
39986 + *  c-file-style: "linux"
39987 + *  indent-tabs-mode: t
39988 + *  c-indent-level: 8
39989 + *  c-basic-offset: 8
39990 + *  tab-width: 8
39991 + * End:
39992 + */
39993 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/mm/Makefile tmp-linux-2.6-xen.patch/arch/x86_64/mm/Makefile
39994 --- ref-linux-2.6.16.9/arch/x86_64/mm/Makefile  2006-04-19 08:10:14.000000000 +0200
39995 +++ tmp-linux-2.6-xen.patch/arch/x86_64/mm/Makefile     2006-04-10 00:05:52.000000000 +0200
39996 @@ -9,3 +9,13 @@ obj-$(CONFIG_K8_NUMA) += k8topology.o
39997  obj-$(CONFIG_ACPI_NUMA) += srat.o
39998  
39999  hugetlbpage-y = ../../i386/mm/hugetlbpage.o
40000 +
40001 +ifdef CONFIG_XEN
40002 +include $(srctree)/scripts/Makefile.xen
40003 +
40004 +ioremap-y      += ../../i386/mm/ioremap-xen.o
40005 +hypervisor-y   += ../../i386/mm/hypervisor.o
40006 +obj-y          += hypervisor.o
40007 +
40008 +obj-y := $(call cherrypickxen, $(obj-y))
40009 +endif
40010 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/mm/pageattr-xen.c tmp-linux-2.6-xen.patch/arch/x86_64/mm/pageattr-xen.c
40011 --- ref-linux-2.6.16.9/arch/x86_64/mm/pageattr-xen.c    1970-01-01 01:00:00.000000000 +0100
40012 +++ tmp-linux-2.6-xen.patch/arch/x86_64/mm/pageattr-xen.c       2006-04-10 00:05:52.000000000 +0200
40013 @@ -0,0 +1,401 @@
40014 +/* 
40015 + * Copyright 2002 Andi Kleen, SuSE Labs. 
40016 + * Thanks to Ben LaHaise for precious feedback.
40017 + */ 
40018 +
40019 +#include <linux/config.h>
40020 +#include <linux/mm.h>
40021 +#include <linux/sched.h>
40022 +#include <linux/highmem.h>
40023 +#include <linux/module.h>
40024 +#include <linux/slab.h>
40025 +#include <asm/uaccess.h>
40026 +#include <asm/processor.h>
40027 +#include <asm/tlbflush.h>
40028 +#include <asm/io.h>
40029 +
40030 +#ifdef CONFIG_XEN
40031 +#include <asm/pgalloc.h>
40032 +#include <asm/mmu_context.h>
40033 +
40034 +LIST_HEAD(mm_unpinned);
40035 +DEFINE_SPINLOCK(mm_unpinned_lock);
40036 +
40037 +static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
40038 +{
40039 +       struct page *page = virt_to_page(pt);
40040 +       unsigned long pfn = page_to_pfn(page);
40041 +
40042 +       BUG_ON(HYPERVISOR_update_va_mapping(
40043 +                      (unsigned long)__va(pfn << PAGE_SHIFT),
40044 +                      pfn_pte(pfn, flags), 0));
40045 +}
40046 +
40047 +static void mm_walk(struct mm_struct *mm, pgprot_t flags)
40048 +{
40049 +       pgd_t       *pgd;
40050 +       pud_t       *pud;
40051 +       pmd_t       *pmd;
40052 +       pte_t       *pte;
40053 +       int          g,u,m;
40054 +
40055 +       pgd = mm->pgd;
40056 +       /*
40057 +        * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
40058 +        * be the 'current' task's pagetables (e.g., current may be 32-bit,
40059 +        * but the pagetables may be for a 64-bit task).
40060 +        * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
40061 +        * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
40062 +        */
40063 +       for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
40064 +               if (pgd_none(*pgd))
40065 +                       continue;
40066 +               pud = pud_offset(pgd, 0);
40067 +               if (PTRS_PER_PUD > 1) /* not folded */ 
40068 +                       mm_walk_set_prot(pud,flags);
40069 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
40070 +                       if (pud_none(*pud))
40071 +                               continue;
40072 +                       pmd = pmd_offset(pud, 0);
40073 +                       if (PTRS_PER_PMD > 1) /* not folded */ 
40074 +                               mm_walk_set_prot(pmd,flags);
40075 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
40076 +                               if (pmd_none(*pmd))
40077 +                                       continue;
40078 +                               pte = pte_offset_kernel(pmd,0);
40079 +                               mm_walk_set_prot(pte,flags);
40080 +                       }
40081 +               }
40082 +       }
40083 +}
40084 +
40085 +void mm_pin(struct mm_struct *mm)
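+/*
+ * Pinning hands a complete pagetable tree to Xen for validation and
+ * write-protection in one go; mm_walk() must have made every pagetable
+ * page read-only before the pin hypercall is issued.
+ */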
40086 +{
40087 +       if (xen_feature(XENFEAT_writable_page_tables))
40088 +               return;
40089 +
40090 +       spin_lock(&mm->page_table_lock);
40091 +
40092 +       mm_walk(mm, PAGE_KERNEL_RO);
40093 +       BUG_ON(HYPERVISOR_update_va_mapping(
40094 +                      (unsigned long)mm->pgd,
40095 +                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
40096 +                      UVMF_TLB_FLUSH));
40097 +       BUG_ON(HYPERVISOR_update_va_mapping(
40098 +                      (unsigned long)__user_pgd(mm->pgd),
40099 +                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
40100 +                      UVMF_TLB_FLUSH));
40101 +       xen_pgd_pin(__pa(mm->pgd)); /* kernel */
40102 +       xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
40103 +       mm->context.pinned = 1;
40104 +       spin_lock(&mm_unpinned_lock);
40105 +       list_del(&mm->context.unpinned);
40106 +       spin_unlock(&mm_unpinned_lock);
40107 +
40108 +       spin_unlock(&mm->page_table_lock);
40109 +}
40110 +
40111 +void mm_unpin(struct mm_struct *mm)
40112 +{
40113 +       if (xen_feature(XENFEAT_writable_page_tables))
40114 +               return;
40115 +
40116 +       spin_lock(&mm->page_table_lock);
40117 +
40118 +       xen_pgd_unpin(__pa(mm->pgd));
40119 +       xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
40120 +       BUG_ON(HYPERVISOR_update_va_mapping(
40121 +                      (unsigned long)mm->pgd,
40122 +                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
40123 +       BUG_ON(HYPERVISOR_update_va_mapping(
40124 +                      (unsigned long)__user_pgd(mm->pgd),
40125 +                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
40126 +       mm_walk(mm, PAGE_KERNEL);
40127 +       xen_tlb_flush();
40128 +       mm->context.pinned = 0;
40129 +       spin_lock(&mm_unpinned_lock);
40130 +       list_add(&mm->context.unpinned, &mm_unpinned);
40131 +       spin_unlock(&mm_unpinned_lock);
40132 +
40133 +       spin_unlock(&mm->page_table_lock);
40134 +}
40135 +
40136 +void mm_pin_all(void)
40137 +{
40138 +       if (xen_feature(XENFEAT_writable_page_tables))
40139 +               return;
40140 +
40141 +       while (!list_empty(&mm_unpinned))       
40142 +               mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
40143 +                                 context.unpinned));
40144 +}
40145 +
40146 +void _arch_exit_mmap(struct mm_struct *mm)
40147 +{
40148 +    struct task_struct *tsk = current;
40149 +
40150 +    task_lock(tsk);
40151 +
40152 +    /*
40153 +     * We aggressively remove the defunct pgd from cr3: we execute unmap_vmas()
40154 +     * *much* faster this way, as avoiding TLB flushes means bigger wrpt batches.
40155 +     */
40156 +    if ( tsk->active_mm == mm )
40157 +    {
40158 +        tsk->active_mm = &init_mm;
40159 +        atomic_inc(&init_mm.mm_count);
40160 +
40161 +        switch_mm(mm, &init_mm, tsk);
40162 +
40163 +        atomic_dec(&mm->mm_count);
40164 +        BUG_ON(atomic_read(&mm->mm_count) == 0);
40165 +    }
40166 +
40167 +    task_unlock(tsk);
40168 +
40169 +    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
40170 +        mm_unpin(mm);
40171 +}
40172 +
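+/* A pinned pagetable page may still be mapped read-only at this point;
+   restore a writable mapping before the frame returns to the allocator. */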
40173 +void pte_free(struct page *pte)
40174 +{
40175 +       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
40176 +
40177 +       if (!pte_write(*virt_to_ptep(va)))
40178 +               BUG_ON(HYPERVISOR_update_va_mapping(
40179 +                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
40180 +       __free_page(pte);
40181 +}
40182 +#endif /* CONFIG_XEN */
40183 +
40184 +static inline pte_t *lookup_address(unsigned long address) 
40185 +{ 
40186 +       pgd_t *pgd = pgd_offset_k(address);
40187 +       pud_t *pud;
40188 +       pmd_t *pmd;
40189 +       pte_t *pte;
40190 +       if (pgd_none(*pgd))
40191 +               return NULL;
40192 +       pud = pud_offset(pgd, address);
40193 +       if (!pud_present(*pud))
40194 +               return NULL; 
40195 +       pmd = pmd_offset(pud, address);
40196 +       if (!pmd_present(*pmd))
40197 +               return NULL; 
40198 +       if (pmd_large(*pmd))
40199 +               return (pte_t *)pmd;
40200 +       pte = pte_offset_kernel(pmd, address);
40201 +       if (pte && !pte_present(*pte))
40202 +               pte = NULL; 
40203 +       return pte;
40204 +} 
40205 +
40206 +static struct page *split_large_page(unsigned long address, pgprot_t prot,
40207 +                                    pgprot_t ref_prot)
40208 +{ 
40209 +       int i; 
40210 +       unsigned long addr;
40211 +       struct page *base = alloc_pages(GFP_KERNEL, 0);
40212 +       pte_t *pbase;
40213 +       if (!base) 
40214 +               return NULL;
40215 +       address = __pa(address);
40216 +       addr = address & LARGE_PAGE_MASK; 
40217 +       pbase = (pte_t *)page_address(base);
40218 +       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
40219 +               pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
40220 +                                  addr == address ? prot : ref_prot);
40221 +       }
40222 +       return base;
40223 +} 
40224 +
40225 +
40226 +static void flush_kernel_map(void *address) 
40227 +{
40228 +       if (0 && address && cpu_has_clflush) {
40229 +               /* is this worth it? */ 
40230 +               int i;
40231 +               for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) 
40232 +                       asm volatile("clflush (%0)" :: "r" (address + i)); 
40233 +       } else
40234 +               asm volatile("wbinvd":::"memory"); 
40235 +       if (address)
40236 +               __flush_tlb_one(address);
40237 +       else
40238 +               __flush_tlb_all();
40239 +}
40240 +
40241 +
40242 +static inline void flush_map(unsigned long address)
40243 +{      
40244 +       on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
40245 +}
40246 +
40247 +struct deferred_page { 
40248 +       struct deferred_page *next; 
40249 +       struct page *fpage;
40250 +       unsigned long address;
40251 +}; 
40252 +static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
40253 +
40254 +static inline void save_page(unsigned long address, struct page *fpage)
40255 +{
40256 +       struct deferred_page *df;
40257 +       df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); 
40258 +       if (!df) {
40259 +               flush_map(address);
40260 +               __free_page(fpage);
40261 +       } else { 
40262 +               df->next = df_list;
40263 +               df->fpage = fpage;
40264 +               df->address = address;
40265 +               df_list = df;
40266 +       }                       
40267 +}
40268 +
40269 +/* 
40270 + * No more special protections in this 2/4MB area - revert to a
40271 + * large page again. 
40272 + */
40273 +static void revert_page(unsigned long address, pgprot_t ref_prot)
40274 +{
40275 +       pgd_t *pgd;
40276 +       pud_t *pud;
40277 +       pmd_t *pmd;
40278 +       pte_t large_pte;
40279 +
40280 +       pgd = pgd_offset_k(address);
40281 +       BUG_ON(pgd_none(*pgd));
40282 +       pud = pud_offset(pgd,address);
40283 +       BUG_ON(pud_none(*pud));
40284 +       pmd = pmd_offset(pud, address);
40285 +       BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
40286 +       pgprot_val(ref_prot) |= _PAGE_PSE;
40287 +       large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
40288 +       set_pte((pte_t *)pmd, large_pte);
40289 +}      
40290 +
40291 +static int
40292 +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
40293 +                                  pgprot_t ref_prot)
40294 +{ 
40295 +       pte_t *kpte; 
40296 +       struct page *kpte_page;
40297 +       unsigned kpte_flags;
40298 +       pgprot_t ref_prot2;
40299 +       kpte = lookup_address(address);
40300 +       if (!kpte) return 0;
40301 +       kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
40302 +       kpte_flags = pte_val(*kpte); 
40303 +       if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
40304 +               if ((kpte_flags & _PAGE_PSE) == 0) { 
40305 +                       set_pte(kpte, pfn_pte(pfn, prot));
40306 +               } else {
40307 +                       /*
40308 +                        * split_large_page will take the reference for this change_page_attr
40309 +                        * on the split page.
40310 +                        */
40311 +
40312 +                       struct page *split;
40313 +                       ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
40314 +
40315 +                       split = split_large_page(address, prot, ref_prot2);
40316 +                       if (!split)
40317 +                               return -ENOMEM;
40318 +                       set_pte(kpte,mk_pte(split, ref_prot2));
40319 +                       kpte_page = split;
40320 +               }       
40321 +               get_page(kpte_page);
40322 +       } else if ((kpte_flags & _PAGE_PSE) == 0) { 
40323 +               set_pte(kpte, pfn_pte(pfn, ref_prot));
40324 +               __put_page(kpte_page);
40325 +       } else
40326 +               BUG();
40327 +
40328 +       /* on x86-64 the direct mapping set at boot is not using 4k pages */
40329 +       /*
40330 +        * ..., but the XEN guest kernels (currently) do:
40331 +        * If the pte was reserved, it means it was created at boot
40332 +        * time (not via split_large_page) and in turn we must not
40333 +        * replace it with a large page.
40334 +        */
40335 +#ifndef CONFIG_XEN
40336 +       BUG_ON(PageReserved(kpte_page));
40337 +#else
40338 +       if (!PageReserved(kpte_page))
40339 +#endif
40340 +               switch (page_count(kpte_page)) {
40341 +               case 1:
40342 +                       save_page(address, kpte_page);               
40343 +                       revert_page(address, ref_prot);
40344 +                       break;
40345 +               case 0:
40346 +                       BUG(); /* memleak and failed 2M page regeneration */
40347 +               }
40348 +       return 0;
40349 +} 
40350 +
40351 +/*
40352 + * Change the page attributes of a page in the linear mapping.
40353 + *
40354 + * This should be used when a page is mapped with a different caching policy
40355 + * than write-back somewhere - some CPUs do not like it when mappings with
40356 + * different caching policies exist. This changes the page attributes of the
40357 + * kernel linear mapping too.
40358 + * 
40359 + * The caller needs to ensure that there are no conflicting mappings elsewhere.
40360 + * This function only deals with the kernel linear map.
40361 + * 
40362 + * Caller must call global_flush_tlb() after this.
40363 + */
40364 +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
40365 +{
40366 +       int err = 0; 
40367 +       int i; 
40368 +
40369 +       down_write(&init_mm.mmap_sem);
40370 +       for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
40371 +               unsigned long pfn = __pa(address) >> PAGE_SHIFT;
40372 +
40373 +               err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
40374 +               if (err) 
40375 +                       break; 
40376 +               /* Handle kernel mapping too which aliases part of the
40377 +                * lowmem */
40378 +               if (__pa(address) < KERNEL_TEXT_SIZE) {
40379 +                       unsigned long addr2;
40380 +                       pgprot_t prot2 = prot;
40381 +                       addr2 = __START_KERNEL_map + __pa(address);
40382 +                       pgprot_val(prot2) &= ~_PAGE_NX;
40383 +                       err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
40384 +               } 
40385 +       }       
40386 +       up_write(&init_mm.mmap_sem); 
40387 +       return err;
40388 +}
40389 +
40390 +/* Don't call this for MMIO areas that may not have a mem_map entry */
40391 +int change_page_attr(struct page *page, int numpages, pgprot_t prot)
40392 +{
40393 +       unsigned long addr = (unsigned long)page_address(page);
40394 +       return change_page_attr_addr(addr, numpages, prot);
40395 +}
40396 +
40397 +void global_flush_tlb(void)
40398 +{ 
40399 +       struct deferred_page *df, *next_df;
40400 +
40401 +       down_read(&init_mm.mmap_sem);
40402 +       df = xchg(&df_list, NULL);
40403 +       up_read(&init_mm.mmap_sem);
40404 +       flush_map((df && !df->next) ? df->address : 0);
40405 +       for (; df; df = next_df) { 
40406 +               next_df = df->next;
40407 +               if (df->fpage) 
40408 +                       __free_page(df->fpage);
40409 +               kfree(df);
40410 +       } 
40411 +} 
40412 +
40413 +EXPORT_SYMBOL(change_page_attr);
40414 +EXPORT_SYMBOL(global_flush_tlb);
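/*
 * [Editorial sketch, not part of the patch] Minimal example of how a
 * caller is expected to use the change_page_attr()/global_flush_tlb()
 * pair above: change the attributes first, then issue exactly one
 * global flush, which also drains the deferred_page list and frees any
 * split 4k page-table pages that were reverted to large pages.  The
 * function name and parameters are hypothetical.
 */
#include <linux/mm.h>
#include <asm/cacheflush.h>     /* change_page_attr(), global_flush_tlb() */

static int make_range_uncached(struct page *pages, int npages)
{
        int err = change_page_attr(pages, npages, PAGE_KERNEL_NOCACHE);
        if (err)
                return err;
        global_flush_tlb();     /* mandatory after change_page_attr() */
        return 0;
}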
40415 diff -Nurp ref-linux-2.6.16.9/arch/x86_64/pci/Makefile tmp-linux-2.6-xen.patch/arch/x86_64/pci/Makefile
40416 --- ref-linux-2.6.16.9/arch/x86_64/pci/Makefile 2006-04-19 08:10:14.000000000 +0200
40417 +++ tmp-linux-2.6-xen.patch/arch/x86_64/pci/Makefile    2006-04-10 00:05:52.000000000 +0200
40418 @@ -15,10 +15,22 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
40419  
40420  obj-$(CONFIG_NUMA)     += k8-bus.o
40421  
40422 +# pcifront should be after mmconfig.o and direct.o as it should only
40423 +# take over if direct access to the PCI bus is unavailable
40424 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront.o
40425 +
40426  direct-y += ../../i386/pci/direct.o
40427  acpi-y   += ../../i386/pci/acpi.o
40428 +pcifront-y += ../../i386/pci/pcifront.o
40429  legacy-y += ../../i386/pci/legacy.o
40430  irq-y    += ../../i386/pci/irq.o
40431  common-y += ../../i386/pci/common.o
40432  fixup-y  += ../../i386/pci/fixup.o
40433  i386-y  += ../../i386/pci/i386.o
40434 +
40435 +ifdef CONFIG_XEN
40436 +irq-y          := ../../i386/pci/irq-xen.o
40437 +include $(srctree)/scripts/Makefile.xen
40438 +
40439 +obj-y := $(call cherrypickxen, $(obj-y))
40440 +endif
40441 diff -Nurp ref-linux-2.6.16.9/drivers/acpi/Kconfig tmp-linux-2.6-xen.patch/drivers/acpi/Kconfig
40442 --- ref-linux-2.6.16.9/drivers/acpi/Kconfig     2006-04-19 08:10:14.000000000 +0200
40443 +++ tmp-linux-2.6-xen.patch/drivers/acpi/Kconfig        2006-04-10 00:05:52.000000000 +0200
40444 @@ -46,7 +46,7 @@ if ACPI
40445  
40446  config ACPI_SLEEP
40447         bool "Sleep States"
40448 -       depends on X86 && (!SMP || SUSPEND_SMP)
40449 +       depends on X86 && (!SMP || SUSPEND_SMP) && !XEN
40450         depends on PM
40451         default y
40452         ---help---
40453 @@ -287,6 +287,7 @@ config ACPI_SYSTEM
40454  config X86_PM_TIMER
40455         bool "Power Management Timer Support" if EMBEDDED
40456         depends on X86
40457 +       depends on !XEN
40458         default y
40459         help
40460           The Power Management Timer is available on all ACPI-capable,
40461 diff -Nurp ref-linux-2.6.16.9/drivers/acpi/tables.c tmp-linux-2.6-xen.patch/drivers/acpi/tables.c
40462 --- ref-linux-2.6.16.9/drivers/acpi/tables.c    2006-04-19 08:10:14.000000000 +0200
40463 +++ tmp-linux-2.6-xen.patch/drivers/acpi/tables.c       2006-04-10 00:05:52.000000000 +0200
40464 @@ -572,6 +572,11 @@ static int __init acpi_table_get_sdt(str
40465   * 
40466   * result: sdt_entry[] is initialized
40467   */
40468 +#if defined(CONFIG_X86_XEN) || defined(CONFIG_X86_64_XEN)
40469 +#define acpi_rsdp_phys_to_va(rsdp_phys) isa_bus_to_virt(rsdp_phys)
40470 +#else
40471 +#define acpi_rsdp_phys_to_va(rsdp_phys) __va(rsdp_phys)
40472 +#endif
40473  
40474  int __init acpi_table_init(void)
40475  {
40476 @@ -587,7 +592,7 @@ int __init acpi_table_init(void)
40477                 return -ENODEV;
40478         }
40479  
40480 -       rsdp = (struct acpi_table_rsdp *)__va(rsdp_phys);
40481 +       rsdp = (struct acpi_table_rsdp *)acpi_rsdp_phys_to_va(rsdp_phys);
40482         if (!rsdp) {
40483                 printk(KERN_WARNING PREFIX "Unable to map RSDP\n");
40484                 return -ENODEV;
40485 diff -Nurp ref-linux-2.6.16.9/drivers/char/mem.c tmp-linux-2.6-xen.patch/drivers/char/mem.c
40486 --- ref-linux-2.6.16.9/drivers/char/mem.c       2006-04-19 08:10:14.000000000 +0200
40487 +++ tmp-linux-2.6-xen.patch/drivers/char/mem.c  2006-04-10 00:05:52.000000000 +0200
40488 @@ -108,6 +108,7 @@ static inline int valid_mmap_phys_addr_r
40489  }
40490  #endif
40491  
40492 +#ifndef ARCH_HAS_DEV_MEM
40493  /*
40494   * This funcion reads the *physical* memory. The f_pos points directly to the 
40495   * memory location. 
40496 @@ -232,6 +233,7 @@ static ssize_t write_mem(struct file * f
40497         *ppos += written;
40498         return written;
40499  }
40500 +#endif
40501  
40502  #ifndef __HAVE_PHYS_MEM_ACCESS_PROT
40503  static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
40504 @@ -773,6 +775,7 @@ static int open_port(struct inode * inod
40505  #define open_kmem      open_mem
40506  #define open_oldmem    open_mem
40507  
40508 +#ifndef ARCH_HAS_DEV_MEM
40509  static struct file_operations mem_fops = {
40510         .llseek         = memory_lseek,
40511         .read           = read_mem,
40512 @@ -780,6 +783,9 @@ static struct file_operations mem_fops =
40513         .mmap           = mmap_mem,
40514         .open           = open_mem,
40515  };
40516 +#else
40517 +extern struct file_operations mem_fops;
40518 +#endif
40519  
40520  static struct file_operations kmem_fops = {
40521         .llseek         = memory_lseek,
40522 diff -Nurp ref-linux-2.6.16.9/drivers/char/tpm/Kconfig tmp-linux-2.6-xen.patch/drivers/char/tpm/Kconfig
40523 --- ref-linux-2.6.16.9/drivers/char/tpm/Kconfig 2006-04-19 08:10:14.000000000 +0200
40524 +++ tmp-linux-2.6-xen.patch/drivers/char/tpm/Kconfig    2006-04-10 00:05:52.000000000 +0200
40525 @@ -22,7 +22,7 @@ config TCG_TPM
40526  
40527  config TCG_NSC
40528         tristate "National Semiconductor TPM Interface"
40529 -       depends on TCG_TPM
40530 +       depends on TCG_TPM && !XEN_UNPRIVILEGED_GUEST
40531         ---help---
40532           If you have a TPM security chip from National Semiconductor 
40533           say Yes and it will be accessible from within Linux.  To 
40534 @@ -31,7 +31,7 @@ config TCG_NSC
40535  
40536  config TCG_ATMEL
40537         tristate "Atmel TPM Interface"
40538 -       depends on TCG_TPM
40539 +       depends on TCG_TPM && !XEN_UNPRIVILEGED_GUEST
40540         ---help---
40541           If you have a TPM security chip from Atmel say Yes and it 
40542           will be accessible from within Linux.  To compile this driver 
40543 @@ -49,5 +49,15 @@ config TCG_INFINEON
40544           Further information on this driver and the supported hardware
40545           can be found at http://www.prosec.rub.de/tpm
40546  
40547 +config TCG_XEN
40548 +       tristate "XEN TPM Interface"
40549 +       depends on TCG_TPM && XEN && XEN_TPMDEV_FRONTEND
40550 +       ---help---
40551 +         If you want to make TPM support available to a Xen
40552 +         user domain, say Yes and it will be accessible
40553 +         from within Linux. To compile this driver as a
40554 +         module, choose M here; the module will be called
40555 +         tpm_xen.
40556 +
40557  endmenu
40558  
40559 diff -Nurp ref-linux-2.6.16.9/drivers/char/tpm/Makefile tmp-linux-2.6-xen.patch/drivers/char/tpm/Makefile
40560 --- ref-linux-2.6.16.9/drivers/char/tpm/Makefile        2006-04-19 08:10:14.000000000 +0200
40561 +++ tmp-linux-2.6-xen.patch/drivers/char/tpm/Makefile   2006-04-10 00:05:52.000000000 +0200
40562 @@ -8,3 +8,4 @@ endif
40563  obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
40564  obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
40565  obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
40566 +obj-$(CONFIG_TCG_XEN) += tpm_xen.o
40567 diff -Nurp ref-linux-2.6.16.9/drivers/char/tpm/tpm.c tmp-linux-2.6-xen.patch/drivers/char/tpm/tpm.c
40568 --- ref-linux-2.6.16.9/drivers/char/tpm/tpm.c   2006-04-19 08:10:14.000000000 +0200
40569 +++ tmp-linux-2.6-xen.patch/drivers/char/tpm/tpm.c      2006-04-10 00:05:52.000000000 +0200
40570 @@ -30,7 +30,8 @@
40571  
40572  enum tpm_const {
40573         TPM_MINOR = 224,        /* officially assigned */
40574 -       TPM_BUFSIZE = 2048,
40575 +       TPM_MIN_BUFSIZE = 2048,
40576 +       TPM_MAX_BUFSIZE = 64 * 1024,
40577         TPM_NUM_DEVICES = 256,
40578         TPM_NUM_MASK_ENTRIES = TPM_NUM_DEVICES / (8 * sizeof(int))
40579  };
40580 @@ -52,14 +53,14 @@ static void timeout_work(void * ptr)
40581  
40582         down(&chip->buffer_mutex);
40583         atomic_set(&chip->data_pending, 0);
40584 -       memset(chip->data_buffer, 0, TPM_BUFSIZE);
40585 +       memset(chip->data_buffer, 0, get_chip_buffersize(chip));
40586         up(&chip->buffer_mutex);
40587  }
40588  
40589  /*
40590   * Internal kernel interface to transmit TPM commands
40591   */
40592 -static ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf,
40593 +static ssize_t tpm_transmit(struct tpm_chip * chip, const char *buf,
40594                             size_t bufsiz)
40595  {
40596         ssize_t rc;
40597 @@ -351,7 +352,7 @@ int tpm_open(struct inode *inode, struct
40598  
40599         spin_unlock(&driver_lock);
40600  
40601 -       chip->data_buffer = kmalloc(TPM_BUFSIZE * sizeof(u8), GFP_KERNEL);
40602 +       chip->data_buffer = kmalloc(get_chip_buffersize(chip) * sizeof(u8), GFP_KERNEL);
40603         if (chip->data_buffer == NULL) {
40604                 chip->num_opens--;
40605                 put_device(chip->dev);
40606 @@ -399,8 +400,8 @@ ssize_t tpm_write(struct file *file, con
40607  
40608         down(&chip->buffer_mutex);
40609  
40610 -       if (in_size > TPM_BUFSIZE)
40611 -               in_size = TPM_BUFSIZE;
40612 +       if (in_size > get_chip_buffersize(chip))
40613 +               in_size = get_chip_buffersize(chip);
40614  
40615         if (copy_from_user
40616             (chip->data_buffer, (void __user *) buf, in_size)) {
40617 @@ -409,9 +410,11 @@ ssize_t tpm_write(struct file *file, con
40618         }
40619  
40620         /* atomic tpm command send and result receive */
40621 -       out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE);
40622 +       out_size = tpm_transmit(chip, chip->data_buffer, 
40623 +                               get_chip_buffersize(chip));
40624  
40625         atomic_set(&chip->data_pending, out_size);
40626 +       atomic_set(&chip->data_position, 0);
40627         up(&chip->buffer_mutex);
40628  
40629         /* Set a timeout by which the reader must come claim the result */
40630 @@ -427,20 +430,33 @@ ssize_t tpm_read(struct file * file, cha
40631  {
40632         struct tpm_chip *chip = file->private_data;
40633         int ret_size;
40634 +       int pos, pending = 0;
40635  
40636 -       del_singleshot_timer_sync(&chip->user_read_timer);
40637 -       flush_scheduled_work();
40638         ret_size = atomic_read(&chip->data_pending);
40639 -       atomic_set(&chip->data_pending, 0);
40640         if (ret_size > 0) {     /* relay data */
40641                 if (size < ret_size)
40642                         ret_size = size;
40643  
40644 +               pos = atomic_read(&chip->data_position);
40645 +
40646                 down(&chip->buffer_mutex);
40647 -               if (copy_to_user(buf, chip->data_buffer, ret_size))
40648 +               if (copy_to_user(buf, &chip->data_buffer[pos], ret_size)) {
40649                         ret_size = -EFAULT;
40650 +               } else {
40651 +                       pending = atomic_read(&chip->data_pending) - ret_size;
40652 +                       if ( pending ) {
40653 +                               atomic_set( &chip->data_pending, pending );
40654 +                               atomic_set( &chip->data_position, pos+ret_size );
40655 +                       }
40656 +               }
40657                 up(&chip->buffer_mutex);
40658         }
40659 +       
40660 +       if (ret_size <= 0 || pending == 0) {
40661 +               atomic_set(&chip->data_pending, 0);
40662 +               del_singleshot_timer_sync(&chip->user_read_timer);
40663 +               flush_scheduled_work();
40664 +       }
40665  
40666         return ret_size;
40667  }
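/*
 * [Editorial sketch, not part of the patch] The data_position counter
 * introduced above lets user space drain a single TPM response with
 * several short read() calls; the driver only clears data_pending and
 * the claim timeout once everything has been consumed.  A hypothetical
 * user-space read loop:
 */
#include <unistd.h>

static ssize_t read_tpm_response(int fd, unsigned char *resp, size_t max)
{
        size_t total = 0;
        ssize_t n = 0;

        /* read() returns 0 once the driver has no more pending data */
        while (total < max && (n = read(fd, resp + total, 512)) > 0)
                total += n;
        return (n < 0) ? n : (ssize_t)total;
}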
40668 @@ -544,6 +560,12 @@ int tpm_register_hardware(struct device 
40669         chip->user_read_timer.data = (unsigned long) chip;
40670  
40671         chip->vendor = entry;
40672 +       
40673 +       if (entry->buffersize < TPM_MIN_BUFSIZE) {
40674 +               entry->buffersize = TPM_MIN_BUFSIZE;
40675 +       } else if (entry->buffersize > TPM_MAX_BUFSIZE) {
40676 +               entry->buffersize = TPM_MAX_BUFSIZE;
40677 +       }
40678  
40679         chip->dev_num = -1;
40680  
40681 diff -Nurp ref-linux-2.6.16.9/drivers/char/tpm/tpm.h tmp-linux-2.6-xen.patch/drivers/char/tpm/tpm.h
40682 --- ref-linux-2.6.16.9/drivers/char/tpm/tpm.h   2006-04-19 08:10:14.000000000 +0200
40683 +++ tmp-linux-2.6-xen.patch/drivers/char/tpm/tpm.h      2006-04-10 00:05:52.000000000 +0200
40684 @@ -50,6 +50,7 @@ struct tpm_vendor_specific {
40685         u8 req_complete_mask;
40686         u8 req_complete_val;
40687         u8 req_canceled;
40688 +       u32 buffersize;
40689         void __iomem *iobase;           /* ioremapped address */
40690         unsigned long base;             /* TPM base address */
40691  
40692 @@ -74,6 +75,7 @@ struct tpm_chip {
40693         /* Data passed to and from the tpm via the read/write calls */
40694         u8 *data_buffer;
40695         atomic_t data_pending;
40696 +       atomic_t data_position;
40697         struct semaphore buffer_mutex;
40698  
40699         struct timer_list user_read_timer;      /* user needs to claim result */
40700 @@ -99,6 +101,11 @@ static inline void tpm_write_index(int b
40701         outb(value & 0xFF, base+1);
40702  }
40703  
40704 +static inline u32 get_chip_buffersize(struct tpm_chip *chip)
40705 +{
40706 +       return chip->vendor->buffersize;
40707 +}
40708 +
40709  extern int tpm_register_hardware(struct device *,
40710                                  struct tpm_vendor_specific *);
40711  extern int tpm_open(struct inode *, struct file *);
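/*
 * [Editorial sketch, not part of the patch] How a backend driver is
 * meant to use the new buffersize plumbing: it fills in
 * tpm_vendor_specific.buffersize before tpm_register_hardware(), which
 * clamps the value into [TPM_MIN_BUFSIZE, TPM_MAX_BUFSIZE]; the core
 * then sizes chip->data_buffer via get_chip_buffersize().  The
 * initializer is illustrative only; a real driver also sets .recv,
 * .send, .cancel, .status and the miscdev fields.
 */
#include "tpm.h"

static struct tpm_vendor_specific tpm_example = {
        .buffersize = 16 * 1024,        /* request 16 KiB; clamped by the core */
};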
40712 diff -Nurp ref-linux-2.6.16.9/drivers/char/tpm/tpm_xen.c tmp-linux-2.6-xen.patch/drivers/char/tpm/tpm_xen.c
40713 --- ref-linux-2.6.16.9/drivers/char/tpm/tpm_xen.c       1970-01-01 01:00:00.000000000 +0100
40714 +++ tmp-linux-2.6-xen.patch/drivers/char/tpm/tpm_xen.c  2006-04-10 00:05:52.000000000 +0200
40715 @@ -0,0 +1,536 @@
40716 +/*
40717 + * Copyright (C) 2004 IBM Corporation
40718 + *
40719 + * Authors:
40720 + * Leendert van Doorn <leendert@watson.ibm.com>
40721 + * Dave Safford <safford@watson.ibm.com>
40722 + * Reiner Sailer <sailer@watson.ibm.com>
40723 + * Kylene Hall <kjhall@us.ibm.com>
40724 + * Stefan Berger <stefanb@us.ibm.com>
40725 + *
40726 + * Maintained by: <tpmdd_devel@lists.sourceforge.net>
40727 + *
40728 + * Device driver for TCG/TCPA TPM (trusted platform module) for XEN.
40729 + * Specifications at www.trustedcomputinggroup.org
40730 + *
40731 + * This program is free software; you can redistribute it and/or
40732 + * modify it under the terms of the GNU General Public License as
40733 + * published by the Free Software Foundation, version 2 of the
40734 + * License.
40735 + *
40736 + */
40737 +
40738 +#include <asm/uaccess.h>
40739 +#include <linux/list.h>
40740 +#include <xen/tpmfe.h>
40741 +#include <linux/device.h>
40742 +#include <linux/interrupt.h>
40743 +#include <linux/platform_device.h>
40744 +#include "tpm.h"
40745 +
40746 +/* read status bits */
40747 +enum {
40748 +       STATUS_BUSY = 0x01,
40749 +       STATUS_DATA_AVAIL = 0x02,
40750 +       STATUS_READY = 0x04
40751 +};
40752 +
40753 +#define MIN(x,y)  (((x) < (y)) ? (x) : (y))
40754 +
40755 +struct transmission {
40756 +       struct list_head next;
40757 +       unsigned char *request;
40758 +       unsigned int request_len;
40759 +       unsigned char *rcv_buffer;
40760 +       unsigned int  buffersize;
40761 +       unsigned int flags;
40762 +};
40763 +
40764 +enum {
40765 +       TRANSMISSION_FLAG_WAS_QUEUED = 0x1
40766 +};
40767 +
40768 +struct data_exchange {
40769 +       struct transmission *current_request;
40770 +       spinlock_t           req_list_lock;
40771 +       wait_queue_head_t    req_wait_queue;
40772 +
40773 +       struct list_head     queued_requests;
40774 +
40775 +       struct transmission *current_response;
40776 +       spinlock_t           resp_list_lock;
40777 +       wait_queue_head_t    resp_wait_queue;     // processes waiting for responses
40778 +
40779 +       struct transmission *req_cancelled;       // if a cancellation was encountered
40780 +
40781 +       unsigned int         fe_status;
40782 +       unsigned int         flags;
40783 +};
40784 +
40785 +enum {
40786 +       DATAEX_FLAG_QUEUED_ONLY = 0x1
40787 +};
40788 +
40789 +static struct data_exchange dataex;
40790 +
40791 +static unsigned long disconnect_time;
40792 +
40793 +static struct tpmfe_device tpmfe;
40794 +
40795 +/* local function prototypes */
40796 +static void __exit cleanup_xen(void);
40797 +
40798 +
40799 +/* =============================================================
40800 + * Some utility functions
40801 + * =============================================================
40802 + */
40803 +static inline struct transmission *
40804 +transmission_alloc(void)
40805 +{
40806 +       return kzalloc(sizeof(struct transmission), GFP_KERNEL);
40807 +}
40808 +
40809 +static inline unsigned char *
40810 +transmission_set_buffer(struct transmission *t,
40811 +                        unsigned char *buffer, unsigned int len)
40812 +{
40813 +       kfree(t->request);
40814 +       t->request = kmalloc(len, GFP_KERNEL);
40815 +       if (t->request) {
40816 +               memcpy(t->request,
40817 +                      buffer,
40818 +                      len);
40819 +               t->request_len = len;
40820 +       }
40821 +       return t->request;
40822 +}
40823 +
40824 +static inline void
40825 +transmission_free(struct transmission *t)
40826 +{
40827 +       kfree(t->request);
40828 +       kfree(t->rcv_buffer);
40829 +       kfree(t);
40830 +}
40831 +
40832 +/* =============================================================
40833 + * Interface with the TPM shared memory driver for XEN
40834 + * =============================================================
40835 + */
40836 +static int tpm_recv(const u8 *buffer, size_t count, const void *ptr)
40837 +{
40838 +       int ret_size = 0;
40839 +       struct transmission *t;
40840 +
40841 +       /*
40842 +        * The request list must contain exactly one request,
40843 +        * and that element must be the one that was passed
40844 +        * in from the front-end.
40845 +        */
40846 +       if (dataex.current_request != ptr) {
40847 +               printk("WARNING: The request pointer is different than the "
40848 +                      "pointer the shared memory driver returned to me. "
40849 +                      "%p != %p\n",
40850 +                      dataex.current_request, ptr);
40851 +       }
40852 +
40853 +       /*
40854 +        * If the request has been cancelled, just quit here
40855 +        */
40856 +       if (dataex.req_cancelled == (struct transmission *)ptr) {
40857 +               if (dataex.current_request == dataex.req_cancelled) {
40858 +                       dataex.current_request = NULL;
40859 +               }
40860 +               transmission_free(dataex.req_cancelled);
40861 +               dataex.req_cancelled = NULL;
40862 +               return 0;
40863 +       }
40864 +
40865 +       if (NULL != (t = dataex.current_request)) {
40866 +               transmission_free(t);
40867 +               dataex.current_request = NULL;
40868 +       }
40869 +
40870 +       t = transmission_alloc();
40871 +       if (t) {
40872 +               unsigned long flags;
40873 +               t->rcv_buffer = kmalloc(count, GFP_KERNEL);
40874 +               if (! t->rcv_buffer) {
40875 +                       transmission_free(t);
40876 +                       return -ENOMEM;
40877 +               }
40878 +               t->buffersize = count;
40879 +               memcpy(t->rcv_buffer, buffer, count);
40880 +               ret_size = count;
40881 +
40882 +               spin_lock_irqsave(&dataex.resp_list_lock, flags);
40883 +               dataex.current_response = t;
40884 +               spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
40885 +               wake_up_interruptible(&dataex.resp_wait_queue);
40886 +       }
40887 +       return ret_size;
40888 +}
40889 +
40890 +
40891 +static void tpm_fe_status(unsigned int flags)
40892 +{
40893 +       dataex.fe_status = flags;
40894 +       if ((dataex.fe_status & TPMFE_STATUS_CONNECTED) == 0) {
40895 +               disconnect_time = jiffies;
40896 +       }
40897 +}
40898 +
40899 +/* =============================================================
40900 + * Interface with the generic TPM driver
40901 + * =============================================================
40902 + */
40903 +static int tpm_xen_recv(struct tpm_chip *chip, u8 * buf, size_t count)
40904 +{
40905 +       unsigned long flags;
40906 +       int rc = 0;
40907 +
40908 +       spin_lock_irqsave(&dataex.resp_list_lock, flags);
40909 +       /*
40910 +        * Check if the previous operation only queued the command.
40911 +        * In this case there won't be a response, so I just
40912 +        * return from here and reset that flag. In any other
40913 +        * case I should receive a response from the back-end.
40914 +        */
40915 +       if ((dataex.flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
40916 +               dataex.flags &= ~DATAEX_FLAG_QUEUED_ONLY;
40917 +               spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
40918 +               /*
40919 +                * A little hack here. The first few measurements
40920 +                * are queued since there's no way to talk to the
40921 +                * TPM yet (due to slowness of the control channel).
40922 +                * So we just make IMA happy by giving it 30 NULL
40923 +                * bytes back where the most important part is
40924 +                * that the result code is '0'.
40925 +                */
40926 +
40927 +               count = MIN(count, 30);
40928 +               memset(buf, 0x0, count);
40929 +               return count;
40930 +       }
40931 +       /*
40932 +        * Check whether something is in the responselist and if
40933 +        * there's nothing in the list wait for something to appear.
40934 +        */
40935 +
40936 +       if (NULL == dataex.current_response) {
40937 +               spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
40938 +               interruptible_sleep_on_timeout(&dataex.resp_wait_queue,
40939 +                                              1000);
40940 +               spin_lock_irqsave(&dataex.resp_list_lock, flags);
40941 +       }
40942 +
40943 +       if (NULL != dataex.current_response) {
40944 +               struct transmission *t = dataex.current_response;
40945 +               dataex.current_response = NULL;
40946 +               rc = MIN(count, t->buffersize);
40947 +               memcpy(buf, t->rcv_buffer, rc);
40948 +               transmission_free(t);
40949 +       }
40950 +
40951 +       spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
40952 +       return rc;
40953 +}
40954 +
40955 +static int tpm_xen_send(struct tpm_chip *chip, u8 * buf, size_t count)
40956 +{
40957 +       /*
40958 +        * We simply pass the packet onto the XEN shared
40959 +        * memory driver.
40960 +        */
40961 +       unsigned long flags;
40962 +       int rc;
40963 +       struct transmission *t = transmission_alloc();
40964 +
40965 +       spin_lock_irqsave(&dataex.req_list_lock, flags);
40966 +       /*
40967 +        * If there's a current request, it must be the
40968 +        * previous request that has timed out.
40969 +        */
40970 +       if (dataex.current_request != NULL) {
40971 +               printk("WARNING: Sending although there is a request outstanding.\n"
40972 +                      "         Previous request must have timed out.\n");
40973 +               transmission_free(dataex.current_request);
40974 +               dataex.current_request = NULL;
40975 +       }
40976 +
40977 +       if (t != NULL) {
40978 +               unsigned int error = 0;
40979 +               /*
40980 +                * Queue the packet if the driver below is not
40981 +                * yet ready or there is already a packet
40982 +                * in the queue.
40983 +                * If the driver below is ready, unqueue all
40984 +                * packets first before sending our current
40985 +                * packet.
40986 +                * For each unqueued packet, except for the
40987 +                * last (=current) packet, call the function
40988 +                * tpm_xen_recv to wait for the response to come
40989 +                * back.
40990 +                */
40991 +               if ((dataex.fe_status & TPMFE_STATUS_CONNECTED) == 0) {
40992 +                       if (time_after(jiffies, disconnect_time + HZ * 10)) {
40993 +                               rc = -ENOENT;
40994 +                       } else {
40995 +                               /*
40996 +                                * copy the request into the buffer
40997 +                                */
40998 +                               if (transmission_set_buffer(t, buf, count)
40999 +                                   == NULL) {
41000 +                                       transmission_free(t);
41001 +                                       rc = -ENOMEM;
41002 +                                       goto exit;
41003 +                               }
41004 +                               dataex.flags |= DATAEX_FLAG_QUEUED_ONLY;
41005 +                               list_add_tail(&t->next, &dataex.queued_requests);
41006 +                               rc = 0;
41007 +                       }
41008 +               } else {
41009 +                       /*
41010 +                        * Check whether there are any packets in the queue
41011 +                        */
41012 +                       while (!list_empty(&dataex.queued_requests)) {
41013 +                               /*
41014 +                                * Need to dequeue them.
41015 +                                * Read the result into a dummy buffer.
41016 +                                */
41017 +                               unsigned char buffer[1];
41018 +                               struct transmission *qt = (struct transmission *) dataex.queued_requests.next;
41019 +                               list_del(&qt->next);
41020 +                               dataex.current_request = qt;
41021 +                               spin_unlock_irqrestore(&dataex.req_list_lock,
41022 +                                                      flags);
41023 +
41024 +                               rc = tpm_fe_send(tpmfe.tpm_private,
41025 +                                                qt->request,
41026 +                                                qt->request_len,
41027 +                                                qt);
41028 +
41029 +                               if (rc < 0) {
41030 +                                       spin_lock_irqsave(&dataex.req_list_lock, flags);
41031 +                                       if ((qt = dataex.current_request) != NULL) {
41032 +                                               /*
41033 +                                                * requeue it at the beginning
41034 +                                                * of the list
41035 +                                                */
41036 +                                               list_add(&qt->next,
41037 +                                                        &dataex.queued_requests);
41038 +                                       }
41039 +                                       dataex.current_request = NULL;
41040 +                                       error = 1;
41041 +                                       break;
41042 +                               }
41043 +                               /*
41044 +                                * After this point qt is not valid anymore!
41045 +                                * It is freed when the front-end delivers the data
41046 +                                * by calling tpm_recv().
41047 +                                */
41048 +
41049 +                               /*
41050 +                                * Try to receive the response now into the provided dummy
41051 +                                * buffer (we don't really care about it, since there is
41052 +                                * no receiver for it anymore).
41053 +                                */
41054 +                               rc = tpm_xen_recv(chip, buffer, sizeof(buffer));
41055 +
41056 +                               spin_lock_irqsave(&dataex.req_list_lock, flags);
41057 +                       }
41058 +
41059 +                       if (error == 0) {
41060 +                               /*
41061 +                                * Finally, send the current request.
41062 +                                */
41063 +                               dataex.current_request = t;
41064 +                               /*
41065 +                                * Call the shared memory driver
41066 +                                * Pass to it the buffer with the request, the
41067 +                                * amount of bytes in the request and
41068 +                                * a void * pointer (here: transmission structure)
41069 +                                */
41070 +                               rc = tpm_fe_send(tpmfe.tpm_private,
41071 +                                                buf, count, t);
41072 +                               /*
41073 +                                * The generic TPM driver will call
41074 +                                * the function to receive the response.
41075 +                                */
41076 +                               if (rc < 0) {
41077 +                                       dataex.current_request = NULL;
41078 +                                       goto queue_it;
41079 +                               }
41080 +                       } else {
41081 +queue_it:
41082 +                               if (transmission_set_buffer(t, buf, count) == NULL) {
41083 +                                       transmission_free(t);
41084 +                                       rc = -ENOMEM;
41085 +                                       goto exit;
41086 +                               }
41087 +                               /*
41088 +                                * An error occurred. Don't even try
41089 +                                * to send the current request. Just
41090 +                                * queue it.
41091 +                                */
41092 +                               dataex.flags |= DATAEX_FLAG_QUEUED_ONLY;
41093 +                               list_add_tail(&t->next,
41094 +                                             &dataex.queued_requests);
41095 +                               rc = 0;
41096 +                       }
41097 +               }
41098 +       } else {
41099 +               rc = -ENOMEM;
41100 +       }
41101 +
41102 +exit:
41103 +       spin_unlock_irqrestore(&dataex.req_list_lock, flags);
41104 +       return rc;
41105 +}
41106 +
41107 +static void tpm_xen_cancel(struct tpm_chip *chip)
41108 +{
41109 +       unsigned long flags;
41110 +       spin_lock_irqsave(&dataex.resp_list_lock, flags);
41111 +
41112 +       dataex.req_cancelled = dataex.current_request;
41113 +
41114 +       spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
41115 +}
41116 +
41117 +static u8 tpm_xen_status(struct tpm_chip *chip)
41118 +{
41119 +       unsigned long flags;
41120 +       u8 rc = 0;
41121 +       spin_lock_irqsave(&dataex.resp_list_lock, flags);
41122 +       /*
41123 +        * Data are available if:
41124 +        *  - there's a current response
41125 +        *  - the last packet was queued only (this is fake, but necessary to
41126 +        *      get the generic TPM layer to call the receive function).
41127 +        */
41128 +       if (NULL != dataex.current_response ||
41129 +           0 != (dataex.flags & DATAEX_FLAG_QUEUED_ONLY)) {
41130 +               rc = STATUS_DATA_AVAIL;
41131 +       }
41132 +       spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
41133 +       return rc;
41134 +}
41135 +
41136 +static struct file_operations tpm_xen_ops = {
41137 +       .owner = THIS_MODULE,
41138 +       .llseek = no_llseek,
41139 +       .open = tpm_open,
41140 +       .read = tpm_read,
41141 +       .write = tpm_write,
41142 +       .release = tpm_release,
41143 +};
41144 +
41145 +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
41146 +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
41147 +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
41148 +static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel);
41149 +
41150 +static struct attribute* xen_attrs[] = {
41151 +       &dev_attr_pubek.attr,
41152 +       &dev_attr_pcrs.attr,
41153 +       &dev_attr_caps.attr,
41154 +       &dev_attr_cancel.attr,
41155 +       NULL,
41156 +};
41157 +
41158 +static struct attribute_group xen_attr_grp = { .attrs = xen_attrs };
41159 +
41160 +static struct tpm_vendor_specific tpm_xen = {
41161 +       .recv = tpm_xen_recv,
41162 +       .send = tpm_xen_send,
41163 +       .cancel = tpm_xen_cancel,
41164 +       .status = tpm_xen_status,
41165 +       .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
41166 +       .req_complete_val  = STATUS_DATA_AVAIL,
41167 +       .req_canceled = STATUS_READY,
41168 +       .base = 0,
41169 +       .attr_group = &xen_attr_grp,
41170 +       .miscdev.fops = &tpm_xen_ops,
41171 +       .buffersize = 64 * 1024,
41172 +};
41173 +
41174 +static struct platform_device *pdev;
41175 +
41176 +static struct tpmfe_device tpmfe = {
41177 +       .receive = tpm_recv,
41178 +       .status  = tpm_fe_status,
41179 +};
41180 +
41181 +
41182 +static int __init init_xen(void)
41183 +{
41184 +       int rc;
41185 +
41186 +       if ((xen_start_info->flags & SIF_INITDOMAIN)) {
41187 +               return -EPERM;
41188 +       }
41189 +       /*
41190 +        * Register the device with the low-level front-end
41191 +        * driver.
41192 +        */
41193 +       if ((rc = tpm_fe_register_receiver(&tpmfe)) < 0) {
41194 +               goto err_exit;
41195 +       }
41196 +
41197 +       /*
41198 +        * Register our device with the system.
41199 +        */
41200 +       pdev = platform_device_register_simple("tpm_vtpm", -1, NULL, 0);
41201 +       if (IS_ERR(pdev)) {
41202 +               rc = PTR_ERR(pdev);
41203 +               goto err_unreg_fe;
41204 +       }
41205 +
41206 +       tpm_xen.buffersize = tpmfe.max_tx_size;
41207 +
41208 +       if ((rc = tpm_register_hardware(&pdev->dev, &tpm_xen)) < 0) {
41209 +               goto err_unreg_pdev;
41210 +       }
41211 +
41212 +       dataex.current_request = NULL;
41213 +       spin_lock_init(&dataex.req_list_lock);
41214 +       init_waitqueue_head(&dataex.req_wait_queue);
41215 +       INIT_LIST_HEAD(&dataex.queued_requests);
41216 +
41217 +       dataex.current_response = NULL;
41218 +       spin_lock_init(&dataex.resp_list_lock);
41219 +       init_waitqueue_head(&dataex.resp_wait_queue);
41220 +
41221 +       disconnect_time = jiffies;
41222 +
41223 +       return 0;
41224 +
41225 +
41226 +err_unreg_pdev:
41227 +       platform_device_unregister(pdev);
41228 +err_unreg_fe:
41229 +       tpm_fe_unregister_receiver();
41230 +
41231 +err_exit:
41232 +       return rc;
41233 +}
41234 +
41235 +static void __exit cleanup_xen(void)
41236 +{
41237 +       struct tpm_chip *chip = dev_get_drvdata(&pdev->dev);
41238 +       if (chip) {
41239 +               tpm_remove_hardware(chip->dev);
41240 +               platform_device_unregister(pdev);
41241 +               tpm_fe_unregister_receiver();
41242 +       }
41243 +}
41244 +
41245 +module_init(init_xen);
41246 +module_exit(cleanup_xen);
41247 +
41248 +MODULE_AUTHOR("Stefan Berger (stefanb@us.ibm.com)");
41249 +MODULE_DESCRIPTION("TPM Driver for XEN (shared memory)");
41250 +MODULE_VERSION("1.0");
41251 +MODULE_LICENSE("GPL");
41252 diff -Nurp ref-linux-2.6.16.9/drivers/char/tty_io.c tmp-linux-2.6-xen.patch/drivers/char/tty_io.c
41253 --- ref-linux-2.6.16.9/drivers/char/tty_io.c    2006-04-19 08:10:14.000000000 +0200
41254 +++ tmp-linux-2.6-xen.patch/drivers/char/tty_io.c       2006-04-10 00:05:52.000000000 +0200
41255 @@ -132,6 +132,8 @@ LIST_HEAD(tty_drivers);                     /* linked list
41256     vt.c for deeply disgusting hack reasons */
41257  DECLARE_MUTEX(tty_sem);
41258  
41259 +int console_use_vt = 1;
41260 +
41261  #ifdef CONFIG_UNIX98_PTYS
41262  extern struct tty_driver *ptm_driver;  /* Unix98 pty masters; for /dev/ptmx */
41263  extern int pty_limit;          /* Config limit on Unix98 ptys */
41264 @@ -2054,7 +2056,7 @@ retry_open:
41265                 goto got_driver;
41266         }
41267  #ifdef CONFIG_VT
41268 -       if (device == MKDEV(TTY_MAJOR,0)) {
41269 +       if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) {
41270                 extern struct tty_driver *console_driver;
41271                 driver = console_driver;
41272                 index = fg_console;
41273 @@ -3237,6 +3239,8 @@ static int __init tty_init(void)
41274  #endif
41275  
41276  #ifdef CONFIG_VT
41277 +       if (!console_use_vt)
41278 +               goto out_vt;
41279         cdev_init(&vc0_cdev, &console_fops);
41280         if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
41281             register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
41282 @@ -3245,6 +3249,7 @@ static int __init tty_init(void)
41283         class_device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, "tty0");
41284  
41285         vty_init();
41286 + out_vt:
41287  #endif
41288         return 0;
41289  }
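/*
 * [Editorial sketch, not part of the patch] Intended use of the new
 * console_use_vt flag: a paravirtual console driver that wants to own
 * /dev/tty0 clears the flag before tty_init() runs, so the VT layer
 * neither registers /dev/vc/0 nor receives MKDEV(TTY_MAJOR, 0) opens.
 * The initcall below is hypothetical; console initcalls run well
 * before tty_init().
 */
#include <linux/init.h>

extern int console_use_vt;

static int __init pv_console_setup(void)
{
        console_use_vt = 0;     /* route tty0 opens away from the VT driver */
        return 0;
}
console_initcall(pv_console_setup);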
41290 diff -Nurp ref-linux-2.6.16.9/drivers/firmware/Kconfig tmp-linux-2.6-xen.patch/drivers/firmware/Kconfig
41291 --- ref-linux-2.6.16.9/drivers/firmware/Kconfig 2006-04-19 08:10:14.000000000 +0200
41292 +++ tmp-linux-2.6-xen.patch/drivers/firmware/Kconfig    2006-04-10 00:05:52.000000000 +0200
41293 @@ -8,7 +8,7 @@ menu "Firmware Drivers"
41294  config EDD
41295         tristate "BIOS Enhanced Disk Drive calls determine boot disk (EXPERIMENTAL)"
41296         depends on EXPERIMENTAL
41297 -       depends on !IA64
41298 +       depends on !IA64 && !XEN
41299         help
41300           Say Y or M here if you want to enable BIOS Enhanced Disk Drive
41301           Services real mode BIOS calls to determine which disk
41302 diff -Nurp ref-linux-2.6.16.9/drivers/Makefile tmp-linux-2.6-xen.patch/drivers/Makefile
41303 --- ref-linux-2.6.16.9/drivers/Makefile 2006-04-19 08:10:14.000000000 +0200
41304 +++ tmp-linux-2.6-xen.patch/drivers/Makefile    2006-04-10 00:05:52.000000000 +0200
41305 @@ -34,6 +34,7 @@ obj-y                         += base/ block/ misc/ mfd/ net/
41306  obj-$(CONFIG_NUBUS)            += nubus/
41307  obj-$(CONFIG_ATM)              += atm/
41308  obj-$(CONFIG_PPC_PMAC)         += macintosh/
41309 +obj-$(CONFIG_XEN)              += xen/
41310  obj-$(CONFIG_IDE)              += ide/
41311  obj-$(CONFIG_FC4)              += fc4/
41312  obj-$(CONFIG_SCSI)             += scsi/
41313 diff -Nurp ref-linux-2.6.16.9/drivers/pci/Kconfig tmp-linux-2.6-xen.patch/drivers/pci/Kconfig
41314 --- ref-linux-2.6.16.9/drivers/pci/Kconfig      2006-04-19 08:10:14.000000000 +0200
41315 +++ tmp-linux-2.6-xen.patch/drivers/pci/Kconfig 2006-04-10 00:05:52.000000000 +0200
41316 @@ -5,6 +5,7 @@ config PCI_MSI
41317         bool "Message Signaled Interrupts (MSI and MSI-X)"
41318         depends on PCI
41319         depends on (X86_LOCAL_APIC && X86_IO_APIC) || IA64
41320 +       depends on !XEN
41321         help
41322            This allows device drivers to enable MSI (Message Signaled
41323            Interrupts).  Message Signaled Interrupts enable a device to
41324 diff -Nurp ref-linux-2.6.16.9/drivers/serial/Kconfig tmp-linux-2.6-xen.patch/drivers/serial/Kconfig
41325 --- ref-linux-2.6.16.9/drivers/serial/Kconfig   2006-04-19 08:10:14.000000000 +0200
41326 +++ tmp-linux-2.6-xen.patch/drivers/serial/Kconfig      2006-04-10 00:05:52.000000000 +0200
41327 @@ -11,6 +11,7 @@ menu "Serial drivers"
41328  config SERIAL_8250
41329         tristate "8250/16550 and compatible serial support"
41330         depends on (BROKEN || !SPARC)
41331 +       depends on !XEN_DISABLE_SERIAL
41332         select SERIAL_CORE
41333         ---help---
41334           This selects whether you want to include the driver for the standard
41335 diff -Nurp ref-linux-2.6.16.9/drivers/video/Kconfig tmp-linux-2.6-xen.patch/drivers/video/Kconfig
41336 --- ref-linux-2.6.16.9/drivers/video/Kconfig    2006-04-19 08:10:14.000000000 +0200
41337 +++ tmp-linux-2.6-xen.patch/drivers/video/Kconfig       2006-04-10 00:05:52.000000000 +0200
41338 @@ -495,7 +495,7 @@ config FB_HGA_ACCEL
41339  
41340  config VIDEO_SELECT
41341         bool
41342 -       depends on (FB = y) && X86
41343 +       depends on (FB = y) && X86 && !XEN
41344         default y
41345  
41346  config FB_SGIVW
41347 diff -Nurp ref-linux-2.6.16.9/drivers/xen/balloon/balloon.c tmp-linux-2.6-xen.patch/drivers/xen/balloon/balloon.c
41348 --- ref-linux-2.6.16.9/drivers/xen/balloon/balloon.c    1970-01-01 01:00:00.000000000 +0100
41349 +++ tmp-linux-2.6-xen.patch/drivers/xen/balloon/balloon.c       2006-04-10 00:05:52.000000000 +0200
41350 @@ -0,0 +1,592 @@
41351 +/******************************************************************************
41352 + * balloon.c
41353 + *
41354 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
41355 + *
41356 + * Copyright (c) 2003, B Dragovic
41357 + * Copyright (c) 2003-2004, M Williamson, K Fraser
41358 + * Copyright (c) 2005 Dan M. Smith, IBM Corporation
41359 + * 
41360 + * This program is free software; you can redistribute it and/or
41361 + * modify it under the terms of the GNU General Public License version 2
41362 + * as published by the Free Software Foundation; or, when distributed
41363 + * separately from the Linux kernel or incorporated into other
41364 + * software packages, subject to the following license:
41365 + * 
41366 + * Permission is hereby granted, free of charge, to any person obtaining a copy
41367 + * of this source file (the "Software"), to deal in the Software without
41368 + * restriction, including without limitation the rights to use, copy, modify,
41369 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
41370 + * and to permit persons to whom the Software is furnished to do so, subject to
41371 + * the following conditions:
41372 + * 
41373 + * The above copyright notice and this permission notice shall be included in
41374 + * all copies or substantial portions of the Software.
41375 + * 
41376 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41377 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41378 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41379 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41380 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
41381 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41382 + * IN THE SOFTWARE.
41383 + */
41384 +
41385 +#include <linux/config.h>
41386 +#include <linux/kernel.h>
41387 +#include <linux/module.h>
41388 +#include <linux/sched.h>
41389 +#include <linux/errno.h>
41390 +#include <linux/mm.h>
41391 +#include <linux/mman.h>
41392 +#include <linux/smp_lock.h>
41393 +#include <linux/pagemap.h>
41394 +#include <linux/bootmem.h>
41395 +#include <linux/highmem.h>
41396 +#include <linux/vmalloc.h>
41397 +#include <xen/xen_proc.h>
41398 +#include <asm/hypervisor.h>
41399 +#include <xen/balloon.h>
41400 +#include <xen/interface/memory.h>
41401 +#include <asm/pgalloc.h>
41402 +#include <asm/pgtable.h>
41403 +#include <asm/uaccess.h>
41404 +#include <asm/tlb.h>
41405 +#include <linux/list.h>
41406 +
41407 +#include <xen/xenbus.h>
41408 +
41409 +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
41410 +
41411 +static struct proc_dir_entry *balloon_pde;
41412 +
41413 +static DECLARE_MUTEX(balloon_mutex);
41414 +
41415 +/*
41416 + * Protects atomic reservation decrease/increase against concurrent increases.
41417 + * Also protects non-atomic updates of current_pages and driver_pages, and
41418 + * balloon lists.
41419 + */
41420 +spinlock_t balloon_lock = SPIN_LOCK_UNLOCKED;
41421 +
41422 +/* We aim for 'current allocation' == 'target allocation'. */
41423 +static unsigned long current_pages;
41424 +static unsigned long target_pages;
41425 +
41426 +/* VM /proc information for memory */
41427 +extern unsigned long totalram_pages;
41428 +
41429 +/* We may hit the hard limit in Xen. If we do then we remember it. */
41430 +static unsigned long hard_limit;
41431 +
41432 +/*
41433 + * Drivers may alter the memory reservation independently, but they must
41434 + * inform the balloon driver so that we can avoid hitting the hard limit.
41435 + */
41436 +static unsigned long driver_pages;
41437 +
41438 +/* List of ballooned pages, threaded through the mem_map array. */
41439 +static LIST_HEAD(ballooned_pages);
41440 +static unsigned long balloon_low, balloon_high;
41441 +
41442 +/* Main work function, always executed in process context. */
41443 +static void balloon_process(void *unused);
41444 +static DECLARE_WORK(balloon_worker, balloon_process, NULL);
41445 +static struct timer_list balloon_timer;
41446 +
41447 +#define PAGE_TO_LIST(p) (&(p)->ballooned)
41448 +#define LIST_TO_PAGE(l) list_entry((l), struct page, ballooned)
41449 +#define UNLIST_PAGE(p)                         \
41450 +       do {                                    \
41451 +               list_del(PAGE_TO_LIST(p));      \
41452 +               PAGE_TO_LIST(p)->next = NULL;   \
41453 +               PAGE_TO_LIST(p)->prev = NULL;   \
41454 +       } while(0)
41455 +
41456 +#define IPRINTK(fmt, args...) \
41457 +       printk(KERN_INFO "xen_mem: " fmt, ##args)
41458 +#define WPRINTK(fmt, args...) \
41459 +       printk(KERN_WARNING "xen_mem: " fmt, ##args)
41460 +
41461 +/* balloon_append: add the given page to the balloon. */
41462 +static void balloon_append(struct page *page)
41463 +{
41464 +       /* Lowmem is re-populated first, so highmem pages go at list tail. */
41465 +       if (PageHighMem(page)) {
41466 +               list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
41467 +               balloon_high++;
41468 +       } else {
41469 +               list_add(PAGE_TO_LIST(page), &ballooned_pages);
41470 +               balloon_low++;
41471 +       }
41472 +}
41473 +
41474 +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
41475 +static struct page *balloon_retrieve(void)
41476 +{
41477 +       struct page *page;
41478 +
41479 +       if (list_empty(&ballooned_pages))
41480 +               return NULL;
41481 +
41482 +       page = LIST_TO_PAGE(ballooned_pages.next);
41483 +       UNLIST_PAGE(page);
41484 +
41485 +       if (PageHighMem(page))
41486 +               balloon_high--;
41487 +       else
41488 +               balloon_low--;
41489 +
41490 +       return page;
41491 +}
41492 +
41493 +static struct page *balloon_first_page(void)
41494 +{
41495 +       if (list_empty(&ballooned_pages))
41496 +               return NULL;
41497 +       return LIST_TO_PAGE(ballooned_pages.next);
41498 +}
41499 +
41500 +static struct page *balloon_next_page(struct page *page)
41501 +{
41502 +       struct list_head *next = PAGE_TO_LIST(page)->next;
41503 +       if (next == &ballooned_pages)
41504 +               return NULL;
41505 +       return LIST_TO_PAGE(next);
41506 +}
41507 +
41508 +static void balloon_alarm(unsigned long unused)
41509 +{
41510 +       schedule_work(&balloon_worker);
41511 +}
41512 +
41513 +static unsigned long current_target(void)
41514 +{
41515 +       unsigned long target = min(target_pages, hard_limit);
41516 +       if (target > (current_pages + balloon_low + balloon_high))
41517 +               target = current_pages + balloon_low + balloon_high;
41518 +       return target;
41519 +}
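/*
 * [Editorial example, not part of the patch] current_target() clamps
 * twice: with target_pages == 131072, hard_limit == 98304 and
 * current_pages + balloon_low + balloon_high == 120000, min() yields
 * 98304, which is already below the 120000 pages reachable, so 98304
 * is returned and the Xen hard limit wins.
 */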
41520 +
41521 +static int increase_reservation(unsigned long nr_pages)
41522 +{
41523 +       unsigned long *frame_list, pfn, i, flags;
41524 +       struct page   *page;
41525 +       long           rc;
41526 +       struct xen_memory_reservation reservation = {
41527 +               .address_bits = 0,
41528 +               .extent_order = 0,
41529 +               .domid        = DOMID_SELF
41530 +       };
41531 +
41532 +       if (nr_pages > (PAGE_SIZE / sizeof(unsigned long)))
41533 +               nr_pages = PAGE_SIZE / sizeof(unsigned long);
41534 +
41535 +       frame_list = (unsigned long *)__get_free_page(GFP_KERNEL);
41536 +       if (frame_list == NULL)
41537 +               return -ENOMEM;
41538 +
41539 +       balloon_lock(flags);
41540 +
41541 +       page = balloon_first_page();
41542 +       for (i = 0; i < nr_pages; i++) {
41543 +               BUG_ON(page == NULL);
41544 +               frame_list[i] = page_to_pfn(page);
41545 +               page = balloon_next_page(page);
41546 +       }
41547 +
41548 +       reservation.extent_start = frame_list;
41549 +       reservation.nr_extents   = nr_pages;
41550 +       rc = HYPERVISOR_memory_op(
41551 +               XENMEM_populate_physmap, &reservation);
41552 +       if (rc < nr_pages) {
41553 +               int ret;
41554 +               /* We hit the Xen hard limit: reprobe. */
41555 +               reservation.extent_start = frame_list;
41556 +               reservation.nr_extents   = rc;
41557 +               ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
41558 +                               &reservation);
41559 +               BUG_ON(ret != rc);
41560 +               hard_limit = current_pages + rc - driver_pages;
41561 +               goto out;
41562 +       }
41563 +
41564 +       for (i = 0; i < nr_pages; i++) {
41565 +               page = balloon_retrieve();
41566 +               BUG_ON(page == NULL);
41567 +
41568 +               pfn = page_to_pfn(page);
41569 +               BUG_ON(phys_to_machine_mapping_valid(pfn));
41570 +
41571 +               /* Update P->M and M->P tables. */
41572 +               set_phys_to_machine(pfn, frame_list[i]);
41573 +               xen_machphys_update(frame_list[i], pfn);
41574 +            
41575 +               /* Link back into the page tables if not highmem. */
41576 +               if (pfn < max_low_pfn) {
41577 +                       int ret;
41578 +                       ret = HYPERVISOR_update_va_mapping(
41579 +                               (unsigned long)__va(pfn << PAGE_SHIFT),
41580 +                               pfn_pte_ma(frame_list[i], PAGE_KERNEL),
41581 +                               0);
41582 +                       BUG_ON(ret);
41583 +               }
41584 +
41585 +               /* Relinquish the page back to the allocator. */
41586 +               ClearPageReserved(page);
41587 +               set_page_count(page, 1);
41588 +               __free_page(page);
41589 +       }
41590 +
41591 +       current_pages += nr_pages;
41592 +       totalram_pages = current_pages;
41593 +
41594 + out:
41595 +       balloon_unlock(flags);
41596 +
41597 +       free_page((unsigned long)frame_list);
41598 +
41599 +       return 0;
41600 +}
41601 +
41602 +static int decrease_reservation(unsigned long nr_pages)
41603 +{
41604 +       unsigned long *frame_list, pfn, i, flags;
41605 +       struct page   *page;
41606 +       void          *v;
41607 +       int            need_sleep = 0;
41608 +       int ret;
41609 +       struct xen_memory_reservation reservation = {
41610 +               .address_bits = 0,
41611 +               .extent_order = 0,
41612 +               .domid        = DOMID_SELF
41613 +       };
41614 +
41615 +       if (nr_pages > (PAGE_SIZE / sizeof(unsigned long)))
41616 +               nr_pages = PAGE_SIZE / sizeof(unsigned long);
41617 +
41618 +       frame_list = (unsigned long *)__get_free_page(GFP_KERNEL);
41619 +       if (frame_list == NULL)
41620 +               return -ENOMEM;
41621 +
41622 +       for (i = 0; i < nr_pages; i++) {
41623 +               if ((page = alloc_page(GFP_HIGHUSER)) == NULL) {
41624 +                       nr_pages = i;
41625 +                       need_sleep = 1;
41626 +                       break;
41627 +               }
41628 +
41629 +               pfn = page_to_pfn(page);
41630 +               frame_list[i] = pfn_to_mfn(pfn);
41631 +
41632 +               if (!PageHighMem(page)) {
41633 +                       v = phys_to_virt(pfn << PAGE_SHIFT);
41634 +                       scrub_pages(v, 1);
41635 +                       ret = HYPERVISOR_update_va_mapping(
41636 +                               (unsigned long)v, __pte_ma(0), 0);
41637 +                       BUG_ON(ret);
41638 +               }
41639 +#ifdef CONFIG_XEN_SCRUB_PAGES
41640 +               else {
41641 +                       v = kmap(page);
41642 +                       scrub_pages(v, 1);
41643 +                       kunmap(page);
41644 +               }
41645 +#endif
41646 +       }
41647 +
41648 +       /* Ensure that ballooned highmem pages don't have kmaps. */
41649 +       kmap_flush_unused();
41650 +       flush_tlb_all();
41651 +
41652 +       balloon_lock(flags);
41653 +
41654 +       /* No more mappings: invalidate P2M and add to balloon. */
41655 +       for (i = 0; i < nr_pages; i++) {
41656 +               pfn = mfn_to_pfn(frame_list[i]);
41657 +               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
41658 +               balloon_append(pfn_to_page(pfn));
41659 +       }
41660 +
41661 +       reservation.extent_start = frame_list;
41662 +       reservation.nr_extents   = nr_pages;
41663 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
41664 +       BUG_ON(ret != nr_pages);
41665 +
41666 +       current_pages -= nr_pages;
41667 +       totalram_pages = current_pages;
41668 +
41669 +       balloon_unlock(flags);
41670 +
41671 +       free_page((unsigned long)frame_list);
41672 +
41673 +       return need_sleep;
41674 +}
41675 +
41676 +/*
41677 + * Multiple worker invocations are serialised by the balloon mutex.
41678 + * We may still race with updates of the target counts (which are protected
41679 + * by the balloon lock), or with changes to the Xen hard limit, but we will
41680 + * recover from these in time.
41681 + */
41682 +static void balloon_process(void *unused)
41683 +{
41684 +       int need_sleep = 0;
41685 +       long credit;
41686 +
41687 +       down(&balloon_mutex);
41688 +
41689 +       do {
41690 +               credit = current_target() - current_pages;
41691 +               if (credit > 0)
41692 +                       need_sleep = (increase_reservation(credit) != 0);
41693 +               if (credit < 0)
41694 +                       need_sleep = (decrease_reservation(-credit) != 0);
41695 +
41696 +#ifndef CONFIG_PREEMPT
41697 +               if (need_resched())
41698 +                       schedule();
41699 +#endif
41700 +       } while ((credit != 0) && !need_sleep);
41701 +
41702 +       /* Schedule more work if there is some still to be done. */
41703 +       if (current_target() != current_pages)
41704 +               mod_timer(&balloon_timer, jiffies + HZ);
41705 +
41706 +       up(&balloon_mutex);
41707 +}
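+/*
+ * Editor's worked example (not in the original patch): with 4 KiB pages, a
+ * target of 1 GiB is 262144 pages; if current_pages is 260096, the credit
+ * is +2048 and increase_reservation(2048) runs, while a negative credit
+ * would drive decrease_reservation() instead.
+ */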
41708 +
41709 +/* Resets the Xen limit, sets new target, and kicks off processing. */
41710 +static void set_new_target(unsigned long target)
41711 +{
41712 +       /* No lock needed: these are plain writes, not read-modify-write updates. */
41713 +       hard_limit   = ~0UL;
41714 +       target_pages = target;
41715 +       schedule_work(&balloon_worker);
41716 +}
41717 +
41718 +static struct xenbus_watch target_watch =
41719 +{
41720 +       .node = "memory/target"
41721 +};
41722 +
41723 +/* React to a change in the target key */
41724 +static void watch_target(struct xenbus_watch *watch,
41725 +                        const char **vec, unsigned int len)
41726 +{
41727 +       unsigned long long new_target;
41728 +       int err;
41729 +
41730 +       err = xenbus_scanf(XBT_NULL, "memory", "target", "%llu", &new_target);
41731 +       if (err != 1) {
41732 +               /* This is ok (for domain0 at least) - so just return */
41733 +               return;
41734 +       }
41735 +
41736 +       /* The given memory/target value is in KiB, so it needs converting
41737 +        * to pages: PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
41738 +        */
41739 +       set_new_target(new_target >> (PAGE_SHIFT - 10));
41740 +
41741 +}
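+/*
+ * Editor's worked example (not in the original patch): with PAGE_SHIFT = 12
+ * the shift is 12 - 10 = 2, so a memory/target of 524288 KiB (512 MiB)
+ * becomes 524288 >> 2 = 131072 pages.
+ */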
41742 +
41743 +static int balloon_init_watcher(struct notifier_block *notifier,
41744 +                                unsigned long event,
41745 +                                void *data)
41746 +{
41747 +       int err;
41748 +
41749 +       err = register_xenbus_watch(&target_watch);
41750 +       if (err)
41751 +               printk(KERN_ERR "Failed to set balloon watcher\n");
41752 +
41753 +       return NOTIFY_DONE;
41754 +
41755 +}
41756 +
41757 +static int balloon_write(struct file *file, const char __user *buffer,
41758 +                         unsigned long count, void *data)
41759 +{
41760 +       char memstring[64], *endchar;
41761 +       unsigned long long target_bytes;
41762 +
41763 +       if (!capable(CAP_SYS_ADMIN))
41764 +               return -EPERM;
41765 +
41766 +       if (count <= 1)
41767 +               return -EBADMSG; /* runt */
41768 +       if (count >= sizeof(memstring))
41769 +               return -EFBIG;   /* too long */
41770 +
41771 +       if (copy_from_user(memstring, buffer, count))
41772 +               return -EFAULT;
41773 +       memstring[count] = '\0'; /* terminate at the end of user input */
41774 +
41775 +       target_bytes = memparse(memstring, &endchar);
41776 +       set_new_target(target_bytes >> PAGE_SHIFT);
41777 +
41778 +       return count;
41779 +}
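+/*
+ * Editor's usage note (not in the original patch): the handler accepts
+ * memparse() suffixes, so from a privileged shell
+ *
+ *     echo 512M > /proc/xen/balloon
+ *
+ * requests a 512 MiB target; balloon_read() below reports progress.
+ */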
41780 +
41781 +static int balloon_read(char *page, char **start, off_t off,
41782 +                        int count, int *eof, void *data)
41783 +{
41784 +       int len;
41785 +
41786 +       len = sprintf(
41787 +               page,
41788 +               "Current allocation: %8lu kB\n"
41789 +               "Requested target:   %8lu kB\n"
41790 +               "Low-mem balloon:    %8lu kB\n"
41791 +               "High-mem balloon:   %8lu kB\n"
41792 +               "Xen hard limit:     ",
41793 +               PAGES2KB(current_pages), PAGES2KB(target_pages), 
41794 +               PAGES2KB(balloon_low), PAGES2KB(balloon_high));
41795 +
41796 +       if (hard_limit != ~0UL) {
41797 +               len += sprintf(
41798 +                       page + len, 
41799 +                       "%8lu kB (inc. %8lu kB driver headroom)\n",
41800 +                       PAGES2KB(hard_limit), PAGES2KB(driver_pages));
41801 +       } else {
41802 +               len += sprintf(
41803 +                       page + len,
41804 +                       "     ??? kB\n");
41805 +       }
41806 +
41807 +       *eof = 1;
41808 +       return len;
41809 +}
41810 +
41811 +static struct notifier_block xenstore_notifier;
41812 +
41813 +static int __init balloon_init(void)
41814 +{
41815 +       unsigned long pfn;
41816 +       struct page *page;
41817 +
41818 +       IPRINTK("Initialising balloon driver.\n");
41819 +
41820 +       if (xen_init() < 0)
41821 +               return -1;
41822 +
41823 +       current_pages = min(xen_start_info->nr_pages, max_pfn);
41824 +       totalram_pages = current_pages;
41825 +       target_pages  = current_pages;
41826 +       balloon_low   = 0;
41827 +       balloon_high  = 0;
41828 +       driver_pages  = 0UL;
41829 +       hard_limit    = ~0UL;
41830 +
41831 +       init_timer(&balloon_timer);
41832 +       balloon_timer.data = 0;
41833 +       balloon_timer.function = balloon_alarm;
41834 +
41835 +       if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
41836 +               WPRINTK("Unable to create /proc/xen/balloon.\n");
41837 +               return -1;
41838 +       }
41839 +
41840 +       balloon_pde->read_proc  = balloon_read;
41841 +       balloon_pde->write_proc = balloon_write;
41842 +
41843 +       /* Initialise the balloon with excess memory space. */
41844 +       for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
41845 +               page = pfn_to_page(pfn);
41846 +               if (!PageReserved(page))
41847 +                       balloon_append(page);
41848 +       }
41849 +
41850 +       target_watch.callback = watch_target;
41851 +       xenstore_notifier.notifier_call = balloon_init_watcher;
41852 +
41853 +       register_xenstore_notifier(&xenstore_notifier);
41854 +
41855 +       return 0;
41856 +}
41857 +
41858 +subsys_initcall(balloon_init);
41859 +
41860 +void balloon_update_driver_allowance(long delta)
41861 +{
41862 +       unsigned long flags;
41863 +
41864 +       balloon_lock(flags);
41865 +       driver_pages += delta;
41866 +       balloon_unlock(flags);
41867 +}
41868 +
41869 +static int dealloc_pte_fn(
41870 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
41871 +{
41872 +       unsigned long mfn = pte_mfn(*pte);
41873 +       int ret;
41874 +       struct xen_memory_reservation reservation = {
41875 +               .extent_start = &mfn,
41876 +               .nr_extents   = 1,
41877 +               .extent_order = 0,
41878 +               .domid        = DOMID_SELF
41879 +       };
41880 +       set_pte_at(&init_mm, addr, pte, __pte_ma(0));
41881 +       set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
41882 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
41883 +       BUG_ON(ret != 1);
41884 +       return 0;
41885 +}
41886 +
41887 +struct page *balloon_alloc_empty_page_range(unsigned long nr_pages)
41888 +{
41889 +       unsigned long vstart, flags;
41890 +       unsigned int  order = get_order(nr_pages * PAGE_SIZE);
41891 +       int ret;
41892 +
41893 +       vstart = __get_free_pages(GFP_KERNEL, order);
41894 +       if (vstart == 0)
41895 +               return NULL;
41896 +
41897 +       scrub_pages(vstart, 1 << order);
41898 +
41899 +       balloon_lock(flags);
41900 +       ret = apply_to_page_range(&init_mm, vstart,
41901 +                                 PAGE_SIZE << order, dealloc_pte_fn, NULL);
41902 +       BUG_ON(ret);
41903 +       current_pages -= 1UL << order;
41904 +       totalram_pages = current_pages;
41905 +       balloon_unlock(flags);
41906 +
41907 +       schedule_work(&balloon_worker);
41908 +
41909 +       flush_tlb_all();
41910 +
41911 +       return virt_to_page(vstart);
41912 +}
41913 +
41914 +void balloon_dealloc_empty_page_range(
41915 +       struct page *page, unsigned long nr_pages)
41916 +{
41917 +       unsigned long i, flags;
41918 +       unsigned int  order = get_order(nr_pages * PAGE_SIZE);
41919 +
41920 +       balloon_lock(flags);
41921 +       for (i = 0; i < (1UL << order); i++)
41922 +               balloon_append(page + i);
41923 +       balloon_unlock(flags);
41924 +
41925 +       schedule_work(&balloon_worker);
41926 +}
41927 +
41928 +EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
41929 +EXPORT_SYMBOL_GPL(balloon_alloc_empty_page_range);
41930 +EXPORT_SYMBOL_GPL(balloon_dealloc_empty_page_range);
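+/*
+ * Editor's sketch (illustration only; the helpers are exported above but
+ * this caller is hypothetical): a backend driver holding pages outside the
+ * balloon's control might pair the calls like this.
+ */
+#if 0  /* not compiled -- illustrative use of the exported interface */
+static void example_driver_use(void)
+{
+       struct page *pg = balloon_alloc_empty_page_range(16);
+       if (pg != NULL) {
+               balloon_update_driver_allowance(16);
+               /* ... map foreign frames into the emptied range ... */
+               balloon_update_driver_allowance(-16);
+               balloon_dealloc_empty_page_range(pg, 16);
+       }
+}
+#endif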
41931 +
41932 +MODULE_LICENSE("Dual BSD/GPL");
41933 +
41934 +/*
41935 + * Local variables:
41936 + *  c-file-style: "linux"
41937 + *  indent-tabs-mode: t
41938 + *  c-indent-level: 8
41939 + *  c-basic-offset: 8
41940 + *  tab-width: 8
41941 + * End:
41942 + */
41943 diff -Nurp ref-linux-2.6.16.9/drivers/xen/balloon/Makefile tmp-linux-2.6-xen.patch/drivers/xen/balloon/Makefile
41944 --- ref-linux-2.6.16.9/drivers/xen/balloon/Makefile     1970-01-01 01:00:00.000000000 +0100
41945 +++ tmp-linux-2.6-xen.patch/drivers/xen/balloon/Makefile        2006-04-10 00:05:52.000000000 +0200
41946 @@ -0,0 +1,2 @@
41947 +
41948 +obj-y += balloon.o
41949 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkback/blkback.c tmp-linux-2.6-xen.patch/drivers/xen/blkback/blkback.c
41950 --- ref-linux-2.6.16.9/drivers/xen/blkback/blkback.c    1970-01-01 01:00:00.000000000 +0100
41951 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkback/blkback.c       2006-04-10 00:05:52.000000000 +0200
41952 @@ -0,0 +1,604 @@
41953 +/******************************************************************************
41954 + * arch/xen/drivers/blkif/backend/main.c
41955 + * 
41956 + * Back-end of the driver for virtual block devices. This portion of the
41957 + * driver exports a 'unified' block-device interface that can be accessed
41958 + * by any operating system that implements a compatible front end. A 
41959 + * reference front-end implementation can be found in:
41960 + *  arch/xen/drivers/blkif/frontend
41961 + * 
41962 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
41963 + * Copyright (c) 2005, Christopher Clark
41964 + * 
41965 + * This program is free software; you can redistribute it and/or
41966 + * modify it under the terms of the GNU General Public License version 2
41967 + * as published by the Free Software Foundation; or, when distributed
41968 + * separately from the Linux kernel or incorporated into other
41969 + * software packages, subject to the following license:
41970 + * 
41971 + * Permission is hereby granted, free of charge, to any person obtaining a copy
41972 + * of this source file (the "Software"), to deal in the Software without
41973 + * restriction, including without limitation the rights to use, copy, modify,
41974 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
41975 + * and to permit persons to whom the Software is furnished to do so, subject to
41976 + * the following conditions:
41977 + * 
41978 + * The above copyright notice and this permission notice shall be included in
41979 + * all copies or substantial portions of the Software.
41980 + * 
41981 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41982 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41983 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41984 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41985 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
41986 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41987 + * IN THE SOFTWARE.
41988 + */
41989 +
41990 +#include <linux/spinlock.h>
41991 +#include <linux/kthread.h>
41992 +#include <linux/list.h>
41993 +#include <xen/balloon.h>
41994 +#include <asm/hypervisor.h>
41995 +#include "common.h"
41996 +
41997 +/*
41998 + * These are rather arbitrary. They are fairly large because adjacent requests
41999 + * pulled from a communication ring are quite likely to end up being part of
42000 + * the same scatter/gather request at the disc.
42001 + * 
42002 + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
42003 + * 
42004 + * This will increase the chances of being able to write whole tracks.
42005 + * 64 should be enough to keep us competitive with Linux.
42006 + */
42007 +static int blkif_reqs = 64;
42008 +module_param_named(reqs, blkif_reqs, int, 0);
42009 +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
42010 +
42011 +static int mmap_pages;
42012 +
42013 +/* Run-time switchable: /sys/module/blkback/parameters/ */
42014 +static int log_stats = 0;
42015 +static int debug_lvl = 0;
42016 +module_param(log_stats, int, 0644);
42017 +module_param(debug_lvl, int, 0644);
42018 +
42019 +/*
42020 + * Each outstanding request that we've passed to the lower device layers has a 
42021 + * 'pending_req' allocated to it. Each bio that completes decrements the
42022 + * pendcnt towards zero. When it hits zero, the specified domain has a
42023 + * response queued for it, with the saved 'id' passed back.
42024 + */
42025 +typedef struct {
42026 +       blkif_t       *blkif;
42027 +       unsigned long  id;
42028 +       int            nr_pages;
42029 +       atomic_t       pendcnt;
42030 +       unsigned short operation;
42031 +       int            status;
42032 +       struct list_head free_list;
42033 +} pending_req_t;
42034 +
42035 +static pending_req_t *pending_reqs;
42036 +static struct list_head pending_free;
42037 +static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
42038 +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
42039 +
42040 +#define BLKBACK_INVALID_HANDLE (~0)
42041 +
42042 +static unsigned long mmap_vstart;
42043 +static unsigned long *pending_vaddrs;
42044 +static grant_handle_t *pending_grant_handles;
42045 +
42046 +static inline int vaddr_pagenr(pending_req_t *req, int seg)
42047 +{
42048 +       return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
42049 +}
42050 +
42051 +static inline unsigned long vaddr(pending_req_t *req, int seg)
42052 +{
42053 +       return pending_vaddrs[vaddr_pagenr(req, seg)];
42054 +}
42055 +
42056 +#define pending_handle(_req, _seg) \
42057 +       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
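+/*
+ * Editor's worked example (not in the original patch): the bookkeeping
+ * arrays are flat, indexed per (request, segment).  Assuming
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST == 11, segment 3 of pending_reqs[2]
+ * maps to index 2 * 11 + 3 = 25 in pending_vaddrs and
+ * pending_grant_handles.
+ */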
42058 +
42059 +
42060 +#ifdef CONFIG_XEN_BLKDEV_TAP_BE
42061 +/*
42062 + * If the tap driver is used, we may get pages belonging to either the tap
42063 + * or (more likely) the real frontend.  The backend must specify which domain
42064 + * a given page belongs to in update_va_mapping though.  For the moment, 
42065 + * the tap rewrites the ID field of the request to contain the request index
42066 + * and the id of the real front end domain.
42067 + */
42068 +#define BLKTAP_COOKIE 0xbeadfeed
42069 +static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
42070 +#endif
42071 +
42072 +static int do_block_io_op(blkif_t *blkif);
42073 +static void dispatch_rw_block_io(blkif_t *blkif,
42074 +                                blkif_request_t *req,
42075 +                                pending_req_t *pending_req);
42076 +static void make_response(blkif_t *blkif, unsigned long id, 
42077 +                          unsigned short op, int st);
42078 +
42079 +/******************************************************************
42080 + * misc small helpers
42081 + */
42082 +static pending_req_t* alloc_req(void)
42083 +{
42084 +       pending_req_t *req = NULL;
42085 +       unsigned long flags;
42086 +
42087 +       spin_lock_irqsave(&pending_free_lock, flags);
42088 +       if (!list_empty(&pending_free)) {
42089 +               req = list_entry(pending_free.next, pending_req_t, free_list);
42090 +               list_del(&req->free_list);
42091 +       }
42092 +       spin_unlock_irqrestore(&pending_free_lock, flags);
42093 +       return req;
42094 +}
42095 +
42096 +static void free_req(pending_req_t *req)
42097 +{
42098 +       unsigned long flags;
42099 +       int was_empty;
42100 +
42101 +       spin_lock_irqsave(&pending_free_lock, flags);
42102 +       was_empty = list_empty(&pending_free);
42103 +       list_add(&req->free_list, &pending_free);
42104 +       spin_unlock_irqrestore(&pending_free_lock, flags);
42105 +       if (was_empty)
42106 +               wake_up(&pending_free_wq);
42107 +}
42108 +
42109 +static void unplug_queue(blkif_t *blkif)
42110 +{
42111 +       if (blkif->plug == NULL)
42112 +               return;
42113 +       if (blkif->plug->unplug_fn)
42114 +               blkif->plug->unplug_fn(blkif->plug);
42115 +       blk_put_queue(blkif->plug);
42116 +       blkif->plug = NULL;
42117 +}
42118 +
42119 +static void plug_queue(blkif_t *blkif, struct bio *bio)
42120 +{
42121 +       request_queue_t *q = bdev_get_queue(bio->bi_bdev);
42122 +
42123 +       if (q == blkif->plug)
42124 +               return;
42125 +       unplug_queue(blkif);
42126 +       blk_get_queue(q);
42127 +       blkif->plug = q;
42128 +}
42129 +
42130 +static void fast_flush_area(pending_req_t *req)
42131 +{
42132 +       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
42133 +       unsigned int i, invcount = 0;
42134 +       grant_handle_t handle;
42135 +       int ret;
42136 +
42137 +       for (i = 0; i < req->nr_pages; i++) {
42138 +               handle = pending_handle(req, i);
42139 +               if (handle == BLKBACK_INVALID_HANDLE)
42140 +                       continue;
42141 +               unmap[invcount].host_addr    = vaddr(req, i);
42142 +               unmap[invcount].dev_bus_addr = 0;
42143 +               unmap[invcount].handle       = handle;
42144 +               pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
42145 +               invcount++;
42146 +       }
42147 +
42148 +       ret = HYPERVISOR_grant_table_op(
42149 +               GNTTABOP_unmap_grant_ref, unmap, invcount);
42150 +       BUG_ON(ret);
42151 +}
42152 +
42153 +/******************************************************************
42154 + * SCHEDULER FUNCTIONS
42155 + */
42156 +
42157 +static void print_stats(blkif_t *blkif)
42158 +{
42159 +       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
42160 +              current->comm, blkif->st_oo_req,
42161 +              blkif->st_rd_req, blkif->st_wr_req);
42162 +       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
42163 +       blkif->st_rd_req = 0;
42164 +       blkif->st_wr_req = 0;
42165 +       blkif->st_oo_req = 0;
42166 +}
42167 +
42168 +int blkif_schedule(void *arg)
42169 +{
42170 +       blkif_t *blkif = arg;
42171 +
42172 +       blkif_get(blkif);
42173 +
42174 +       if (debug_lvl)
42175 +               printk(KERN_DEBUG "%s: started\n", current->comm);
42176 +
42177 +       /*
42178 +        * This thread may start before we are connected to the frontend
42179 +        * driver. In that case we must wait to be fully connected.
42180 +        */
42181 +       wait_event_interruptible(
42182 +               blkif->wq,
42183 +               blkif_connected(blkif) || kthread_should_stop());
42184 +
42185 +       while (!kthread_should_stop()) {
42186 +               wait_event_interruptible(
42187 +                       blkif->wq,
42188 +                       atomic_read(&blkif->io_pending) ||
42189 +                       kthread_should_stop());
42190 +               wait_event_interruptible(
42191 +                       pending_free_wq,
42192 +                       !list_empty(&pending_free) ||
42193 +                       kthread_should_stop());
42194 +
42195 +               atomic_set(&blkif->io_pending, 0);
42196 +               if (do_block_io_op(blkif))
42197 +                       atomic_inc(&blkif->io_pending);
42198 +               unplug_queue(blkif);
42199 +
42200 +               if (log_stats && time_after(jiffies, blkif->st_print))
42201 +                       print_stats(blkif);
42202 +       }
42203 +
42204 +       if (log_stats)
42205 +               print_stats(blkif);
42206 +       if (debug_lvl)
42207 +               printk(KERN_DEBUG "%s: exiting\n", current->comm);
42208 +
42209 +       blkif->xenblkd = NULL;
42210 +       blkif_put(blkif);
42211 +
42212 +       return 0;
42213 +}
42214 +
42215 +/******************************************************************
42216 + * COMPLETION CALLBACK -- Called as bh->b_end_io()
42217 + */
42218 +
42219 +static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
42220 +{
42221 +       /* An error fails the entire request. */
42222 +       if (!uptodate) {
42223 +               DPRINTK("Buffer not up-to-date at end of operation\n");
42224 +               pending_req->status = BLKIF_RSP_ERROR;
42225 +       }
42226 +
42227 +       if (atomic_dec_and_test(&pending_req->pendcnt)) {
42228 +               fast_flush_area(pending_req);
42229 +               make_response(pending_req->blkif, pending_req->id,
42230 +                             pending_req->operation, pending_req->status);
42231 +               blkif_put(pending_req->blkif);
42232 +               free_req(pending_req);
42233 +       }
42234 +}
42235 +
42236 +static int end_block_io_op(struct bio *bio, unsigned int done, int error)
42237 +{
42238 +       if (bio->bi_size != 0)
42239 +               return 1;
42240 +       __end_block_io_op(bio->bi_private, !error);
42241 +       bio_put(bio);
42242 +       return error;
42243 +}
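+/*
+ * Editor's note: in this kernel generation bi_end_io may fire for partial
+ * completions; bi_size only reaches zero on the final call, so
+ * end_block_io_op() returns 1 (not done) until then and completes the
+ * pending_req exactly once.
+ */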
42244 +
42245 +
42246 +/******************************************************************************
42247 + * NOTIFICATION FROM GUEST OS.
42248 + */
42249 +
42250 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
42251 +{
42252 +       blkif_t *blkif = dev_id;
42253 +
42254 +       atomic_inc(&blkif->io_pending);
42255 +       wake_up(&blkif->wq);
42256 +       return IRQ_HANDLED;
42257 +}
42258 +
42259 +
42260 +
42261 +/******************************************************************
42262 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
42263 + */
42264 +
42265 +static int do_block_io_op(blkif_t *blkif)
42266 +{
42267 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
42268 +       blkif_request_t *req;
42269 +       pending_req_t *pending_req;
42270 +       RING_IDX rc, rp;
42271 +       int more_to_do = 0;
42272 +
42273 +       rc = blk_ring->req_cons;
42274 +       rp = blk_ring->sring->req_prod;
42275 +       rmb(); /* Ensure we see queued requests up to 'rp'. */
42276 +
42277 +       while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
42278 +
42279 +               pending_req = alloc_req();
42280 +               if (NULL == pending_req) {
42281 +                       blkif->st_oo_req++;
42282 +                       more_to_do = 1;
42283 +                       break;
42284 +               }
42285 +
42286 +               req = RING_GET_REQUEST(blk_ring, rc);
42287 +               blk_ring->req_cons = ++rc; /* before make_response() */
42288 +
42289 +               switch (req->operation) {
42290 +               case BLKIF_OP_READ:
42291 +                       blkif->st_rd_req++;
42292 +                       dispatch_rw_block_io(blkif, req, pending_req);
42293 +                       break;
42294 +               case BLKIF_OP_WRITE:
42295 +                       blkif->st_wr_req++;
42296 +                       dispatch_rw_block_io(blkif, req, pending_req);
42297 +                       break;
42298 +               default:
42299 +                       DPRINTK("error: unknown block io operation [%d]\n",
42300 +                               req->operation);
42301 +                       make_response(blkif, req->id, req->operation,
42302 +                                     BLKIF_RSP_ERROR);
42303 +                       free_req(pending_req);
42304 +                       break;
42305 +               }
42306 +       }
42307 +       return more_to_do;
42308 +}
42309 +
42310 +static void dispatch_rw_block_io(blkif_t *blkif,
42311 +                                blkif_request_t *req,
42312 +                                pending_req_t *pending_req)
42313 +{
42314 +       extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
42315 +       int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
42316 +       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
42317 +       struct phys_req preq;
42318 +       struct { 
42319 +               unsigned long buf; unsigned int nsec;
42320 +       } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
42321 +       unsigned int nseg;
42322 +       struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
42323 +       int ret, i, nbio = 0;
42324 +
42325 +       /* Check that number of segments is sane. */
42326 +       nseg = req->nr_segments;
42327 +       if (unlikely(nseg == 0) || 
42328 +           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
42329 +               DPRINTK("Bad number of segments in request (%d)\n", nseg);
42330 +               goto fail_response;
42331 +       }
42332 +
42333 +       preq.dev           = req->handle;
42334 +       preq.sector_number = req->sector_number;
42335 +       preq.nr_sects      = 0;
42336 +
42337 +       pending_req->blkif     = blkif;
42338 +       pending_req->id        = req->id;
42339 +       pending_req->operation = operation;
42340 +       pending_req->status    = BLKIF_RSP_OKAY;
42341 +       pending_req->nr_pages  = nseg;
42342 +
42343 +       for (i = 0; i < nseg; i++) {
42344 +               seg[i].nsec = req->seg[i].last_sect -
42345 +                       req->seg[i].first_sect + 1;
42346 +
42347 +               if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
42348 +                   (seg[i].nsec <= 0))
42349 +                       goto fail_response;
42350 +               preq.nr_sects += seg[i].nsec;
42351 +
42352 +               map[i].host_addr = vaddr(pending_req, i);
42353 +               map[i].dom = blkif->domid;
42354 +               map[i].ref = req->seg[i].gref;
42355 +               map[i].flags = GNTMAP_host_map;
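+               /*
+                * Editor's note: the inversion is intentional -- a WRITE to
+                * disk only reads the guest's buffer, so the grant can be
+                * mapped read-only; a READ must map it writable.
+                */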
42356 +               if (operation == WRITE)
42357 +                       map[i].flags |= GNTMAP_readonly;
42358 +       }
42359 +
42360 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
42361 +       BUG_ON(ret);
42362 +
42363 +       for (i = 0; i < nseg; i++) {
42364 +               if (unlikely(map[i].status != 0)) {
42365 +                       DPRINTK("invalid buffer -- could not remap it\n");
42366 +                       goto fail_flush;
42367 +               }
42368 +
42369 +               pending_handle(pending_req, i) = map[i].handle;
42370 +#ifdef __ia64__
42371 +               pending_vaddrs[vaddr_pagenr(pending_req, i)] =
42372 +                       (unsigned long)gnttab_map_vaddr(map[i]);
42373 +#else
42374 +               set_phys_to_machine(__pa(vaddr(
42375 +                       pending_req, i)) >> PAGE_SHIFT,
42376 +                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
42377 +#endif
42378 +               seg[i].buf  = map[i].dev_bus_addr | 
42379 +                       (req->seg[i].first_sect << 9);
42380 +       }
42381 +
42382 +       if (vbd_translate(&preq, blkif, operation) != 0) {
42383 +               DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
42384 +                       operation == READ ? "read" : "write",
42385 +                       preq.sector_number,
42386 +                       preq.sector_number + preq.nr_sects, preq.dev); 
42387 +               goto fail_flush;
42388 +       }
42389 +
42390 +       for (i = 0; i < nseg; i++) {
42391 +               if (((int)preq.sector_number|(int)seg[i].nsec) &
42392 +                   ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
42393 +                       DPRINTK("Misaligned I/O request from domain %d",
42394 +                               blkif->domid);
42395 +                       goto fail_put_bio;
42396 +               }
42397 +
42398 +               while ((bio == NULL) ||
42399 +                      (bio_add_page(bio,
42400 +                                    virt_to_page(vaddr(pending_req, i)),
42401 +                                    seg[i].nsec << 9,
42402 +                                    seg[i].buf & ~PAGE_MASK) == 0)) {
42403 +                       bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
42404 +                       if (unlikely(bio == NULL))
42405 +                               goto fail_put_bio;
42406 +
42407 +                       bio->bi_bdev    = preq.bdev;
42408 +                       bio->bi_private = pending_req;
42409 +                       bio->bi_end_io  = end_block_io_op;
42410 +                       bio->bi_sector  = preq.sector_number;
42411 +               }
42412 +
42413 +               preq.sector_number += seg[i].nsec;
42414 +       }
42415 +
42416 +       plug_queue(blkif, bio);
42417 +       atomic_set(&pending_req->pendcnt, nbio);
42418 +       blkif_get(blkif);
42419 +
42420 +       for (i = 0; i < nbio; i++)
42421 +               submit_bio(operation, biolist[i]);
42422 +
42423 +       return;
42424 +
42425 + fail_put_bio:
42426 +       for (i = 0; i < (nbio-1); i++)
42427 +               bio_put(biolist[i]);
42428 + fail_flush:
42429 +       fast_flush_area(pending_req);
42430 + fail_response:
42431 +       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
42432 +       free_req(pending_req);
42433 +}
42434 +
42435 +
42436 +
42437 +/******************************************************************
42438 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
42439 + */
42440 +
42441 +
42442 +static void make_response(blkif_t *blkif, unsigned long id, 
42443 +                          unsigned short op, int st)
42444 +{
42445 +       blkif_response_t *resp;
42446 +       unsigned long     flags;
42447 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
42448 +       int more_to_do = 0;
42449 +       int notify;
42450 +
42451 +       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
42452 +
42453 +       /* Place on the response ring for the relevant domain. */ 
42454 +       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
42455 +       resp->id        = id;
42456 +       resp->operation = op;
42457 +       resp->status    = st;
42458 +       blk_ring->rsp_prod_pvt++;
42459 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
42460 +
42461 +       if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
42462 +               /*
42463 +                * Tail check for pending requests. Allows frontend to avoid
42464 +                * notifications if requests are already in flight (lower
42465 +                * overheads and promotes batching).
42466 +                */
42467 +               RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
42468 +
42469 +       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
42470 +               more_to_do = 1;
42471 +
42472 +       }
42473 +       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
42474 +
42475 +       if (more_to_do) {
42476 +               atomic_inc(&blkif->io_pending);
42477 +               wake_up(&blkif->wq);
42478 +       }
42479 +       if (notify)
42480 +               notify_remote_via_irq(blkif->irq);
42481 +}
42482 +
42483 +static int __init blkif_init(void)
42484 +{
42485 +       struct page *page;
42486 +       int i;
42487 +
42488 +       if (xen_init() < 0)
42489 +               return -ENODEV;
42490 +
42491 +       mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
42492 +       pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
42493 +                                       blkif_reqs, GFP_KERNEL);
42494 +       pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
42495 +                                       mmap_pages, GFP_KERNEL);
42496 +       pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
42497 +                                       mmap_pages, GFP_KERNEL);
42498 +       if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
42499 +               kfree(pending_reqs);
42500 +               kfree(pending_grant_handles);
42501 +               kfree(pending_vaddrs);
42502 +               printk(KERN_ERR "%s: out of memory\n", __FUNCTION__);
42503 +               return -ENOMEM;
42504 +       }
42505 +
42506 +       blkif_interface_init();
42507 +
42508 +#ifdef __ia64__
42509 +       extern unsigned long alloc_empty_foreign_map_page_range(
42510 +               unsigned long pages);
42511 +       mmap_vstart = (unsigned long)
42512 +               alloc_empty_foreign_map_page_range(mmap_pages);
42513 +#else /* ! ia64 */
42514 +       page = balloon_alloc_empty_page_range(mmap_pages);
42515 +       BUG_ON(page == NULL);
42516 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
42517 +#endif
42518 +       printk(KERN_INFO "%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
42519 +              __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
42520 +       BUG_ON(mmap_vstart == 0);
42521 +       for (i = 0; i < mmap_pages; i++) {
42522 +               pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
42523 +               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
42524 +       }
42525 +
42526 +       memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
42527 +       INIT_LIST_HEAD(&pending_free);
42528 +
42529 +       for (i = 0; i < blkif_reqs; i++)
42530 +               list_add_tail(&pending_reqs[i].free_list, &pending_free);
42531 +
42532 +       blkif_xenbus_init();
42533 +       __unsafe(THIS_MODULE);
42534 +       return 0;
42535 +}
42536 +
42537 +module_init(blkif_init);
42538 +
42539 +static void blkif_exit(void)
42540 +{
42541 +       BUG();
42542 +}
42543 +
42544 +module_exit(blkif_exit);
42545 +
42546 +MODULE_LICENSE("Dual BSD/GPL");
42547 +
42548 +/*
42549 + * Local variables:
42550 + *  c-file-style: "linux"
42551 + *  indent-tabs-mode: t
42552 + *  c-indent-level: 8
42553 + *  c-basic-offset: 8
42554 + *  tab-width: 8
42555 + * End:
42556 + */
42557 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkback/common.h tmp-linux-2.6-xen.patch/drivers/xen/blkback/common.h
42558 --- ref-linux-2.6.16.9/drivers/xen/blkback/common.h     1970-01-01 01:00:00.000000000 +0100
42559 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkback/common.h        2006-04-10 00:05:52.000000000 +0200
42560 @@ -0,0 +1,150 @@
42561 +/* 
42562 + * This program is free software; you can redistribute it and/or
42563 + * modify it under the terms of the GNU General Public License version 2
42564 + * as published by the Free Software Foundation; or, when distributed
42565 + * separately from the Linux kernel or incorporated into other
42566 + * software packages, subject to the following license:
42567 + * 
42568 + * Permission is hereby granted, free of charge, to any person obtaining a copy
42569 + * of this source file (the "Software"), to deal in the Software without
42570 + * restriction, including without limitation the rights to use, copy, modify,
42571 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
42572 + * and to permit persons to whom the Software is furnished to do so, subject to
42573 + * the following conditions:
42574 + * 
42575 + * The above copyright notice and this permission notice shall be included in
42576 + * all copies or substantial portions of the Software.
42577 + * 
42578 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42579 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42580 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42581 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42582 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
42583 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
42584 + * IN THE SOFTWARE.
42585 + */
42586 +
42587 +#ifndef __BLKIF__BACKEND__COMMON_H__
42588 +#define __BLKIF__BACKEND__COMMON_H__
42589 +
42590 +#include <linux/config.h>
42591 +#include <linux/version.h>
42592 +#include <linux/module.h>
42593 +#include <linux/interrupt.h>
42594 +#include <linux/slab.h>
42595 +#include <linux/blkdev.h>
42596 +#include <linux/vmalloc.h>
42597 +#include <asm/io.h>
42598 +#include <asm/setup.h>
42599 +#include <asm/pgalloc.h>
42600 +#include <xen/evtchn.h>
42601 +#include <asm/hypervisor.h>
42602 +#include <xen/interface/io/blkif.h>
42603 +#include <xen/interface/io/ring.h>
42604 +#include <xen/gnttab.h>
42605 +#include <xen/driver_util.h>
42606 +
42607 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
42608 +                                    __FILE__ , __LINE__ , ## _a )
42609 +
42610 +struct vbd {
42611 +       blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
42612 +       unsigned char  readonly;    /* Non-zero -> read-only */
42613 +       unsigned char  type;        /* VDISK_xxx */
42614 +       u32            pdevice;     /* phys device that this vbd maps to */
42615 +       struct block_device *bdev;
42616 +}; 
42617 +
42618 +struct backend_info; 
42619 +
42620 +typedef struct blkif_st {
42621 +       /* Unique identifier for this interface. */
42622 +       domid_t           domid;
42623 +       unsigned int      handle;
42624 +       /* Physical parameters of the comms window. */
42625 +       unsigned int      evtchn;
42626 +       unsigned int      irq;
42627 +       /* Comms information. */
42628 +       blkif_back_ring_t blk_ring;
42629 +       struct vm_struct *blk_ring_area;
42630 +       /* The VBD attached to this interface. */
42631 +       struct vbd        vbd;
42632 +       /* Back pointer to the backend_info. */
42633 +       struct backend_info *be; 
42634 +       /* Private fields. */
42635 +       enum { DISCONNECTED, CONNECTED } status;
42636 +#ifdef CONFIG_XEN_BLKDEV_TAP_BE
42637 +       /* Is this a blktap frontend */
42638 +       unsigned int     is_blktap;
42639 +#endif
42640 +       spinlock_t       blk_ring_lock;
42641 +       atomic_t         refcnt;
42642 +
42643 +       wait_queue_head_t   wq;
42644 +       struct task_struct  *xenblkd;
42645 +       atomic_t            io_pending;
42646 +       request_queue_t     *plug;
42647 +
42648 +       /* statistics */
42649 +       unsigned long       st_print;
42650 +       int                 st_rd_req;
42651 +       int                 st_wr_req;
42652 +       int                 st_oo_req;
42653 +
42654 +       struct work_struct free_work;
42655 +
42656 +       grant_handle_t shmem_handle;
42657 +       grant_ref_t    shmem_ref;
42658 +} blkif_t;
42659 +
42660 +blkif_t *alloc_blkif(domid_t domid);
42661 +void free_blkif_callback(blkif_t *blkif);
42662 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
42663 +
42664 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
42665 +#define blkif_put(_b)                                  \
42666 +       do {                                            \
42667 +               if (atomic_dec_and_test(&(_b)->refcnt)) \
42668 +                       free_blkif_callback(_b);        \
42669 +       } while (0)
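+/*
+ * Editor's note (usage illustration, not in the original patch): a
+ * reference is held across any use that may outlive the caller --
+ * blkif_schedule() brackets its service loop with blkif_get()/blkif_put(),
+ * and the final blkif_put() from blkback_remove() fires
+ * free_blkif_callback().
+ */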
42670 +
42671 +/* Create a vbd. */
42672 +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
42673 +              unsigned minor, int readonly);
42674 +void vbd_free(struct vbd *vbd);
42675 +
42676 +unsigned long vbd_size(struct vbd *vbd);
42677 +unsigned int vbd_info(struct vbd *vbd);
42678 +unsigned long vbd_secsize(struct vbd *vbd);
42679 +
42680 +struct phys_req {
42681 +       unsigned short       dev;
42682 +       unsigned short       nr_sects;
42683 +       struct block_device *bdev;
42684 +       blkif_sector_t       sector_number;
42685 +};
42686 +
42687 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); 
42688 +
42689 +void blkif_interface_init(void);
42690 +
42691 +void blkif_xenbus_init(void);
42692 +
42693 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
42694 +int blkif_schedule(void *arg);
42695 +
42696 +void update_blkif_status(blkif_t *blkif); 
42697 +
42698 +int blkif_connected(blkif_t *blkif);
42699 +
42700 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
42701 +
42702 +/*
42703 + * Local variables:
42704 + *  c-file-style: "linux"
42705 + *  indent-tabs-mode: t
42706 + *  c-indent-level: 8
42707 + *  c-basic-offset: 8
42708 + *  tab-width: 8
42709 + * End:
42710 + */
42711 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkback/interface.c tmp-linux-2.6-xen.patch/drivers/xen/blkback/interface.c
42712 --- ref-linux-2.6.16.9/drivers/xen/blkback/interface.c  1970-01-01 01:00:00.000000000 +0100
42713 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkback/interface.c     2006-04-10 00:05:52.000000000 +0200
42714 @@ -0,0 +1,188 @@
42715 +/******************************************************************************
42716 + * arch/xen/drivers/blkif/backend/interface.c
42717 + * 
42718 + * Block-device interface management.
42719 + * 
42720 + * Copyright (c) 2004, Keir Fraser
42721 + * 
42722 + * This program is free software; you can redistribute it and/or
42723 + * modify it under the terms of the GNU General Public License version 2
42724 + * as published by the Free Software Foundation; or, when distributed
42725 + * separately from the Linux kernel or incorporated into other
42726 + * software packages, subject to the following license:
42727 + * 
42728 + * Permission is hereby granted, free of charge, to any person obtaining a copy
42729 + * of this source file (the "Software"), to deal in the Software without
42730 + * restriction, including without limitation the rights to use, copy, modify,
42731 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
42732 + * and to permit persons to whom the Software is furnished to do so, subject to
42733 + * the following conditions:
42734 + * 
42735 + * The above copyright notice and this permission notice shall be included in
42736 + * all copies or substantial portions of the Software.
42737 + * 
42738 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42739 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42740 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42741 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42742 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
42743 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
42744 + * IN THE SOFTWARE.
42745 + */
42746 +
42747 +#include "common.h"
42748 +#include <xen/evtchn.h>
42749 +
42750 +static kmem_cache_t *blkif_cachep;
42751 +
42752 +blkif_t *alloc_blkif(domid_t domid)
42753 +{
42754 +       blkif_t *blkif;
42755 +
42756 +       blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
42757 +       if (!blkif)
42758 +               return ERR_PTR(-ENOMEM);
42759 +
42760 +       memset(blkif, 0, sizeof(*blkif));
42761 +       blkif->domid = domid;
42762 +       blkif->status = DISCONNECTED;
42763 +       spin_lock_init(&blkif->blk_ring_lock);
42764 +       atomic_set(&blkif->refcnt, 1);
42765 +       init_waitqueue_head(&blkif->wq);
42766 +       blkif->st_print = jiffies;
42767 +
42768 +       return blkif;
42769 +}
42770 +
42771 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
42772 +{
42773 +       struct gnttab_map_grant_ref op;
42774 +       int ret;
42775 +
42776 +       op.host_addr = (unsigned long)blkif->blk_ring_area->addr;
42777 +       op.flags     = GNTMAP_host_map;
42778 +       op.ref       = shared_page;
42779 +       op.dom       = blkif->domid;
42780 +
42781 +       lock_vm_area(blkif->blk_ring_area);
42782 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
42783 +       unlock_vm_area(blkif->blk_ring_area);
42784 +       BUG_ON(ret);
42785 +
42786 +       if (op.status) {
42787 +               DPRINTK("Grant table operation failure!\n");
42788 +               return op.status;
42789 +       }
42790 +
42791 +       blkif->shmem_ref = shared_page;
42792 +       blkif->shmem_handle = op.handle;
42793 +
42794 +#ifdef __ia64__
42795 +       /* On some architectures, map_grant_ref behaves like mmap, in that the
42796 +        * passed address is a hint and a different address may be returned */
42797 +       blkif->blk_ring_area->addr = gnttab_map_vaddr(op);
42798 +#endif
42799 +
42800 +       return 0;
42801 +}
42802 +
42803 +static void unmap_frontend_page(blkif_t *blkif)
42804 +{
42805 +       struct gnttab_unmap_grant_ref op;
42806 +       int ret;
42807 +
42808 +       op.host_addr    = (unsigned long)blkif->blk_ring_area->addr;
42809 +       op.handle       = blkif->shmem_handle;
42810 +       op.dev_bus_addr = 0;
42811 +
42812 +       lock_vm_area(blkif->blk_ring_area);
42813 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
42814 +       unlock_vm_area(blkif->blk_ring_area);
42815 +       BUG_ON(ret);
42816 +}
42817 +
42818 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
42819 +{
42820 +       blkif_sring_t *sring;
42821 +       int err;
42822 +       evtchn_op_t op = {
42823 +               .cmd = EVTCHNOP_bind_interdomain,
42824 +               .u.bind_interdomain.remote_dom = blkif->domid,
42825 +               .u.bind_interdomain.remote_port = evtchn };
42826 +
42827 +       /* Already connected? */
42828 +       if (blkif->irq)
42829 +               return 0;
42830 +
42831 +       if ((blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL)
42832 +               return -ENOMEM;
42833 +
42834 +       err = map_frontend_page(blkif, shared_page);
42835 +       if (err) {
42836 +               free_vm_area(blkif->blk_ring_area);
42837 +               return err;
42838 +       }
42839 +
42840 +       err = HYPERVISOR_event_channel_op(&op);
42841 +       if (err) {
42842 +               unmap_frontend_page(blkif);
42843 +               free_vm_area(blkif->blk_ring_area);
42844 +               return err;
42845 +       }
42846 +
42847 +       blkif->evtchn = op.u.bind_interdomain.local_port;
42848 +
42849 +       sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
42850 +       BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
42851 +
42852 +       blkif->irq = bind_evtchn_to_irqhandler(
42853 +               blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
42854 +
42855 +       /* We're potentially connected now */
42856 +       update_blkif_status(blkif); 
42857 +
42858 +       return 0;
42859 +}
42860 +
42861 +static void free_blkif(void *arg)
42862 +{
42863 +       blkif_t *blkif = (blkif_t *)arg;
42864 +
42865 +       /* Unbind from the irq handler unless already disconnected. */
42866 +       if (blkif->irq) {
42867 +               unbind_from_irqhandler(blkif->irq, blkif);
42868 +               blkif->irq = 0;
42869 +       }
42870 +
42871 +       vbd_free(&blkif->vbd);
42872 +
42873 +       if (blkif->blk_ring.sring) {
42874 +               unmap_frontend_page(blkif);
42875 +               free_vm_area(blkif->blk_ring_area);
42876 +               blkif->blk_ring.sring = NULL;
42877 +       }
42878 +
42879 +       kmem_cache_free(blkif_cachep, blkif);
42880 +}
42881 +
42882 +void free_blkif_callback(blkif_t *blkif)
42883 +{
42884 +       INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif);
42885 +       schedule_work(&blkif->free_work);
42886 +}
42887 +
42888 +void __init blkif_interface_init(void)
42889 +{
42890 +       blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
42891 +                                        0, 0, NULL, NULL);
42892 +}
42893 +
42894 +/*
42895 + * Local variables:
42896 + *  c-file-style: "linux"
42897 + *  indent-tabs-mode: t
42898 + *  c-indent-level: 8
42899 + *  c-basic-offset: 8
42900 + *  tab-width: 8
42901 + * End:
42902 + */
42903 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkback/Makefile tmp-linux-2.6-xen.patch/drivers/xen/blkback/Makefile
42904 --- ref-linux-2.6.16.9/drivers/xen/blkback/Makefile     1970-01-01 01:00:00.000000000 +0100
42905 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkback/Makefile        2006-04-10 00:05:52.000000000 +0200
42906 @@ -0,0 +1,3 @@
42907 +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
42908 +
42909 +blkbk-y        := blkback.o xenbus.o interface.o vbd.o
42910 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkback/vbd.c tmp-linux-2.6-xen.patch/drivers/xen/blkback/vbd.c
42911 --- ref-linux-2.6.16.9/drivers/xen/blkback/vbd.c        1970-01-01 01:00:00.000000000 +0100
42912 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkback/vbd.c   2006-04-10 00:05:52.000000000 +0200
42913 @@ -0,0 +1,126 @@
42914 +/******************************************************************************
42915 + * blkback/vbd.c
42916 + * 
42917 + * Routines for managing virtual block devices (VBDs).
42918 + * 
42919 + * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
42920 + * 
42921 + * This program is free software; you can redistribute it and/or
42922 + * modify it under the terms of the GNU General Public License version 2
42923 + * as published by the Free Software Foundation; or, when distributed
42924 + * separately from the Linux kernel or incorporated into other
42925 + * software packages, subject to the following license:
42926 + * 
42927 + * Permission is hereby granted, free of charge, to any person obtaining a copy
42928 + * of this source file (the "Software"), to deal in the Software without
42929 + * restriction, including without limitation the rights to use, copy, modify,
42930 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
42931 + * and to permit persons to whom the Software is furnished to do so, subject to
42932 + * the following conditions:
42933 + * 
42934 + * The above copyright notice and this permission notice shall be included in
42935 + * all copies or substantial portions of the Software.
42936 + * 
42937 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42938 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42939 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42940 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42941 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
42942 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
42943 + * IN THE SOFTWARE.
42944 + */
42945 +
42946 +#include "common.h"
42947 +#include <xen/xenbus.h>
42948 +
42949 +#define vbd_sz(_v)   ((_v)->bdev->bd_part ?                            \
42950 +       (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity)
42951 +
42952 +unsigned long vbd_size(struct vbd *vbd)
42953 +{
42954 +       return vbd_sz(vbd);
42955 +}
42956 +
42957 +unsigned int vbd_info(struct vbd *vbd)
42958 +{
42959 +       return vbd->type | (vbd->readonly ? VDISK_READONLY : 0);
42960 +}
42961 +
42962 +unsigned long vbd_secsize(struct vbd *vbd)
42963 +{
42964 +       return bdev_hardsect_size(vbd->bdev);
42965 +}
42966 +
42967 +int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
42968 +              unsigned minor, int readonly)
42969 +{
42970 +       struct vbd *vbd;
42971 +
42972 +       vbd = &blkif->vbd;
42973 +       vbd->handle   = handle; 
42974 +       vbd->readonly = readonly;
42975 +       vbd->type     = 0;
42976 +
42977 +       vbd->pdevice  = MKDEV(major, minor);
42978 +
42979 +       vbd->bdev = open_by_devnum(
42980 +               vbd->pdevice,
42981 +               vbd->readonly ? FMODE_READ : FMODE_WRITE);
42982 +       if (IS_ERR(vbd->bdev)) {
42983 +               DPRINTK("vbd_creat: device %08x doesn't exist.\n",
42984 +                       vbd->pdevice);
42985 +               return -ENOENT;
42986 +       }
42987 +
42988 +       if (vbd->bdev->bd_disk == NULL) {
42989 +               DPRINTK("vbd_creat: device %08x doesn't exist.\n",
42990 +                       vbd->pdevice);
42991 +               vbd_free(vbd);
42992 +               return -ENOENT;
42993 +       }
42994 +
42995 +       if (vbd->bdev->bd_disk->flags & GENHD_FL_CD)
42996 +               vbd->type |= VDISK_CDROM;
42997 +       if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
42998 +               vbd->type |= VDISK_REMOVABLE;
42999 +
43000 +       DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
43001 +               handle, blkif->domid);
43002 +       return 0;
43003 +}
43004 +
43005 +void vbd_free(struct vbd *vbd)
43006 +{
43007 +       if (vbd->bdev)
43008 +               blkdev_put(vbd->bdev);
43009 +       vbd->bdev = NULL;
43010 +}
43011 +
43012 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
43013 +{
43014 +       struct vbd *vbd = &blkif->vbd;
43015 +       int rc = -EACCES;
43016 +
43017 +       if ((operation == WRITE) && vbd->readonly)
43018 +               goto out;
43019 +
43020 +       if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
43021 +               goto out;
43022 +
43023 +       req->dev  = vbd->pdevice;
43024 +       req->bdev = vbd->bdev;
43025 +       rc = 0;
43026 +
43027 + out:
43028 +       return rc;
43029 +}
43030 +
43031 +/*
43032 + * Local variables:
43033 + *  c-file-style: "linux"
43034 + *  indent-tabs-mode: t
43035 + *  c-indent-level: 8
43036 + *  c-basic-offset: 8
43037 + *  tab-width: 8
43038 + * End:
43039 + */
43040 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkback/xenbus.c tmp-linux-2.6-xen.patch/drivers/xen/blkback/xenbus.c
43041 --- ref-linux-2.6.16.9/drivers/xen/blkback/xenbus.c     1970-01-01 01:00:00.000000000 +0100
43042 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkback/xenbus.c        2006-04-10 00:05:52.000000000 +0200
43043 @@ -0,0 +1,430 @@
43044 +/*  Xenbus code for blkif backend
43045 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
43046 +    Copyright (C) 2005 XenSource Ltd
43047 +
43048 +    This program is free software; you can redistribute it and/or modify
43049 +    it under the terms of the GNU General Public License as published by
43050 +    the Free Software Foundation; either version 2 of the License, or
43051 +    (at your option) any later version.
43052 +
43053 +    This program is distributed in the hope that it will be useful,
43054 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
43055 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
43056 +    GNU General Public License for more details.
43057 +
43058 +    You should have received a copy of the GNU General Public License
43059 +    along with this program; if not, write to the Free Software
43060 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
43061 +*/
43062 +
43063 +
43064 +#include <stdarg.h>
43065 +#include <linux/module.h>
43066 +#include <linux/kthread.h>
43067 +#include <xen/xenbus.h>
43068 +#include "common.h"
43069 +
43070 +#undef DPRINTK
43071 +#define DPRINTK(fmt, args...) \
43072 +    pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
43073 +
43074 +
43075 +struct backend_info
43076 +{
43077 +       struct xenbus_device *dev;
43078 +       blkif_t *blkif;
43079 +       struct xenbus_watch backend_watch;
43080 +
43081 +       unsigned major;
43082 +       unsigned minor;
43083 +       char *mode;
43084 +};
43085 +
43086 +
43087 +static void maybe_connect(struct backend_info *);
43088 +static void connect(struct backend_info *);
43089 +static int connect_ring(struct backend_info *);
43090 +static void backend_changed(struct xenbus_watch *, const char **,
43091 +                           unsigned int);
43092 +
43093 +int blkif_connected(blkif_t *blkif)
43094 +{
43095 +       return (blkif->be->dev->state == XenbusStateConnected);
43096 +}
43097 +
43098 +void update_blkif_status(blkif_t *blkif)
43099 +{ 
43100 +       if (blkif->irq && blkif->vbd.bdev) {
43101 +               blkif->status = CONNECTED; 
43102 +               (void)blkif_be_int(0, blkif, NULL); 
43103 +       }
43104 +       maybe_connect(blkif->be); 
43105 +}
43106 +
43107 +
43108 +static ssize_t show_physical_device(struct device *_dev,
43109 +                                   struct device_attribute *attr, char *buf)
43110 +{
43111 +       struct xenbus_device *dev = to_xenbus_device(_dev);
43112 +       struct backend_info *be = dev->data;
43113 +       return sprintf(buf, "%x:%x\n", be->major, be->minor);
43114 +}
43115 +DEVICE_ATTR(physical_device, S_IRUSR | S_IRGRP | S_IROTH,
43116 +           show_physical_device, NULL);
43117 +
43118 +
43119 +static ssize_t show_mode(struct device *_dev, struct device_attribute *attr,
43120 +                        char *buf)
43121 +{
43122 +       struct xenbus_device *dev = to_xenbus_device(_dev);
43123 +       struct backend_info *be = dev->data;
43124 +       return sprintf(buf, "%s\n", be->mode);
43125 +}
43126 +DEVICE_ATTR(mode, S_IRUSR | S_IRGRP | S_IROTH, show_mode, NULL);
43127 +
43128 +
43129 +static int blkback_remove(struct xenbus_device *dev)
43130 +{
43131 +       struct backend_info *be = dev->data;
43132 +
43133 +       DPRINTK("");
43134 +
43135 +       if (be->backend_watch.node) {
43136 +               unregister_xenbus_watch(&be->backend_watch);
43137 +               kfree(be->backend_watch.node);
43138 +               be->backend_watch.node = NULL;
43139 +       }
43140 +       if (be->blkif) {
43141 +               be->blkif->status = DISCONNECTED; 
43142 +               if (be->blkif->xenblkd)
43143 +                       kthread_stop(be->blkif->xenblkd);
43144 +               blkif_put(be->blkif);
43145 +               be->blkif = NULL;
43146 +       }
43147 +
43148 +       device_remove_file(&dev->dev, &dev_attr_physical_device);
43149 +       device_remove_file(&dev->dev, &dev_attr_mode);
43150 +
43151 +       kfree(be);
43152 +       dev->data = NULL;
43153 +       return 0;
43154 +}
43155 +
43156 +
43157 +/**
43158 + * Entry point to this code when a new device is created.  Allocate the basic
43159 + * structures, and watch the store waiting for the hotplug scripts to tell us
43160 + * the device's physical major and minor numbers.  Switch to InitWait.
43161 + */
43162 +static int blkback_probe(struct xenbus_device *dev,
43163 +                        const struct xenbus_device_id *id)
43164 +{
43165 +       int err;
43166 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
43167 +                                         GFP_KERNEL);
43168 +       if (!be) {
43169 +               xenbus_dev_fatal(dev, -ENOMEM,
43170 +                                "allocating backend structure");
43171 +               return -ENOMEM;
43172 +       }
43173 +       be->dev = dev;
43174 +       dev->data = be;
43175 +
43176 +       be->blkif = alloc_blkif(dev->otherend_id);
43177 +       if (IS_ERR(be->blkif)) {
43178 +               err = PTR_ERR(be->blkif);
43179 +               be->blkif = NULL;
43180 +               xenbus_dev_fatal(dev, err, "creating block interface");
43181 +               goto fail;
43182 +       }
43183 +
43184 +       /* setup back pointer */
43185 +       be->blkif->be = be; 
43186 +
43187 +       err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
43188 +                                &be->backend_watch, backend_changed);
43189 +       if (err)
43190 +               goto fail;
43191 +
43192 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
43193 +       if (err)
43194 +               goto fail;
43195 +
43196 +       return 0;
43197 +
43198 +fail:
43199 +       DPRINTK("failed");
43200 +       blkback_remove(dev);
43201 +       return err;
43202 +}
43203 +
43204 +
43205 +/**
43206 + * Callback received when the hotplug scripts have placed the physical-device
43207 + * node.  Read it and the mode node, and create a vbd.  If the frontend is
43208 + * ready, connect.
43209 + */
43210 +static void backend_changed(struct xenbus_watch *watch,
43211 +                           const char **vec, unsigned int len)
43212 +{
43213 +       int err;
43214 +       unsigned major;
43215 +       unsigned minor;
43216 +       struct backend_info *be
43217 +               = container_of(watch, struct backend_info, backend_watch);
43218 +       struct xenbus_device *dev = be->dev;
43219 +
43220 +       DPRINTK("");
43221 +
43222 +       err = xenbus_scanf(XBT_NULL, dev->nodename, "physical-device", "%x:%x",
43223 +                          &major, &minor);
43224 +       if (XENBUS_EXIST_ERR(err)) {
43225 +               /* Since this watch will fire once immediately after it is
43226 +                  registered, we expect this.  Ignore it, and wait for the
43227 +                  hotplug scripts. */
43228 +               return;
43229 +       }
43230 +       if (err != 2) {
43231 +               xenbus_dev_fatal(dev, err, "reading physical-device");
43232 +               return;
43233 +       }
43234 +
43235 +       if (be->major && be->minor &&
43236 +           (be->major != major || be->minor != minor)) {
43237 +               printk(KERN_WARNING
43238 +                      "blkback: changing physical device (from %x:%x to "
43239 +                      "%x:%x) not supported.\n", be->major, be->minor,
43240 +                      major, minor);
43241 +               return;
43242 +       }
43243 +
43244 +       be->mode = xenbus_read(XBT_NULL, dev->nodename, "mode", NULL);
43245 +       if (IS_ERR(be->mode)) {
43246 +               err = PTR_ERR(be->mode);
43247 +               be->mode = NULL;
43248 +               xenbus_dev_fatal(dev, err, "reading mode");
43249 +               return;
43250 +       }
43251 +
43252 +       if (be->major == 0 && be->minor == 0) {
43253 +               /* Front end dir is a number, which is used as the handle. */
43254 +
43255 +               char *p = strrchr(dev->otherend, '/') + 1;
43256 +               long handle = simple_strtoul(p, NULL, 0);
43257 +
43258 +               be->major = major;
43259 +               be->minor = minor;
43260 +
43261 +               err = vbd_create(be->blkif, handle, major, minor,
43262 +                                (NULL == strchr(be->mode, 'w')));
43263 +               if (err) {
43264 +                       be->major = 0;
43265 +                       be->minor = 0;
43266 +                       xenbus_dev_fatal(dev, err, "creating vbd structure");
43267 +                       return;
43268 +               }
43269 +
43270 +               be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
43271 +                                                "xvd %d %02x:%02x",
43272 +                                                be->blkif->domid,
43273 +                                                be->major, be->minor);
43274 +               if (IS_ERR(be->blkif->xenblkd)) {
43275 +                       err = PTR_ERR(be->blkif->xenblkd);
43276 +                       be->blkif->xenblkd = NULL;
43277 +                       xenbus_dev_error(dev, err, "start xenblkd");
43278 +                       return;
43279 +               }
43280 +
43281 +               device_create_file(&dev->dev, &dev_attr_physical_device);
43282 +               device_create_file(&dev->dev, &dev_attr_mode);
43283 +
43284 +               /* We're potentially connected now */
43285 +               update_blkif_status(be->blkif); 
43286 +       }
43287 +}
43288 +
43289 +
43290 +/**
43291 + * Callback received when the frontend's state changes.
43292 + */
43293 +static void frontend_changed(struct xenbus_device *dev,
43294 +                            XenbusState frontend_state)
43295 +{
43296 +       struct backend_info *be = dev->data;
43297 +       int err;
43298 +
43299 +       DPRINTK("");
43300 +
43301 +       switch (frontend_state) {
43302 +       case XenbusStateInitialising:
43303 +               break;
43304 +
43305 +       case XenbusStateInitialised:
43306 +       case XenbusStateConnected:
43307 +               /* Ensure we connect even when two watches fire in 
43308 +                  close succession and we miss the intermediate value
43309 +                  of frontend_state. */
43310 +               if (dev->state == XenbusStateConnected)
43311 +                       break;
43312 +
43313 +               err = connect_ring(be);
43314 +               if (err)
43315 +                       break;
43316 +               update_blkif_status(be->blkif);
43317 +               break;
43318 +
43319 +       case XenbusStateClosing:
43320 +               xenbus_switch_state(dev, XenbusStateClosing);
43321 +               break;
43322 +
43323 +       case XenbusStateClosed:
43324 +               device_unregister(&dev->dev);
43325 +               break;
43326 +
43327 +       case XenbusStateUnknown:
43328 +       case XenbusStateInitWait:
43329 +       default:
43330 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
43331 +                                frontend_state);
43332 +               break;
43333 +       }
43334 +}
43335 +
43336 +
43337 +/* ** Connection ** */
43338 +
43339 +
43340 +static void maybe_connect(struct backend_info *be)
43341 +{
43342 +       if ((be->major != 0 || be->minor != 0) &&
43343 +           be->blkif->status == CONNECTED)
43344 +               connect(be);
43345 +}
43346 +
43347 +
43348 +/**
43349 + * Write the physical details regarding the block device to the store, and
43350 + * switch to Connected state.
43351 + */
43352 +static void connect(struct backend_info *be)
43353 +{
43354 +       xenbus_transaction_t xbt;
43355 +       int err;
43356 +       struct xenbus_device *dev = be->dev;
43357 +
43358 +       DPRINTK("%s", dev->otherend);
43359 +
43360 +       /* Supply the information about the device the frontend needs */
43361 +again:
43362 +       err = xenbus_transaction_start(&xbt);
43363 +
43364 +       if (err) {
43365 +               xenbus_dev_fatal(dev, err, "starting transaction");
43366 +               return;
43367 +       }
43368 +
43369 +       err = xenbus_printf(xbt, dev->nodename, "sectors", "%lu",
43370 +                           vbd_size(&be->blkif->vbd));
43371 +       if (err) {
43372 +               xenbus_dev_fatal(dev, err, "writing %s/sectors",
43373 +                                dev->nodename);
43374 +               goto abort;
43375 +       }
43376 +
43377 +       /* FIXME: use a typename instead */
43378 +       err = xenbus_printf(xbt, dev->nodename, "info", "%u",
43379 +                           vbd_info(&be->blkif->vbd));
43380 +       if (err) {
43381 +               xenbus_dev_fatal(dev, err, "writing %s/info",
43382 +                                dev->nodename);
43383 +               goto abort;
43384 +       }
43385 +       err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
43386 +                           vbd_secsize(&be->blkif->vbd));
43387 +       if (err) {
43388 +               xenbus_dev_fatal(dev, err, "writing %s/sector-size",
43389 +                                dev->nodename);
43390 +               goto abort;
43391 +       }
43392 +
43393 +       err = xenbus_transaction_end(xbt, 0);
43394 +       if (err == -EAGAIN)
43395 +               goto again;
43396 +       if (err)
43397 +               xenbus_dev_fatal(dev, err, "ending transaction");
43398 +
43399 +       err = xenbus_switch_state(dev, XenbusStateConnected);
43400 +       if (err)
43401 +               xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
43402 +                                dev->nodename);
43403 +
43404 +       return;
43405 + abort:
43406 +       xenbus_transaction_end(xbt, 1);
43407 +}
43408 +
43409 +
43410 +static int connect_ring(struct backend_info *be)
43411 +{
43412 +       struct xenbus_device *dev = be->dev;
43413 +       unsigned long ring_ref;
43414 +       unsigned int evtchn;
43415 +       int err;
43416 +
43417 +       DPRINTK("%s", dev->otherend);
43418 +
43419 +       err = xenbus_gather(XBT_NULL, dev->otherend, "ring-ref", "%lu", &ring_ref,
43420 +                           "event-channel", "%u", &evtchn, NULL);
43421 +       if (err) {
43422 +               xenbus_dev_fatal(dev, err,
43423 +                                "reading %s/ring-ref and event-channel",
43424 +                                dev->otherend);
43425 +               return err;
43426 +       }
43427 +
43428 +       /* Map the shared frame, irq etc. */
43429 +       err = blkif_map(be->blkif, ring_ref, evtchn);
43430 +       if (err) {
43431 +               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
43432 +                                ring_ref, evtchn);
43433 +               return err;
43434 +       }
43435 +
43436 +       return 0;
43437 +}
43438 +
43439 +
43440 +/* ** Driver Registration ** */
43441 +
43442 +
43443 +static struct xenbus_device_id blkback_ids[] = {
43444 +       { "vbd" },
43445 +       { "" }
43446 +};
43447 +
43448 +
43449 +static struct xenbus_driver blkback = {
43450 +       .name = "vbd",
43451 +       .owner = THIS_MODULE,
43452 +       .ids = blkback_ids,
43453 +       .probe = blkback_probe,
43454 +       .remove = blkback_remove,
43455 +       .otherend_changed = frontend_changed
43456 +};
43457 +
43458 +
43459 +void blkif_xenbus_init(void)
43460 +{
43461 +       xenbus_register_backend(&blkback);
43462 +}
43463 +
43464 +
43465 +/*
43466 + * Local variables:
43467 + *  c-file-style: "linux"
43468 + *  indent-tabs-mode: t
43469 + *  c-indent-level: 8
43470 + *  c-basic-offset: 8
43471 + *  tab-width: 8
43472 + * End:
43473 + */
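frontend_changed() above deliberately treats Initialised and Connected identically: xenbus watch events can coalesce, so the backend may never observe the intermediate frontend state and must connect on whichever of the two it sees first, while a repeat event for an already-connected device is a no-op. A toy, userspace-only simulation of that defensive pattern (the enum and helper names are illustrative, not the xenbus API):

/* Toy simulation of the defensive check in frontend_changed() above.
 * Both INITIALISED and CONNECTED trigger connection; a duplicate
 * event after connecting does nothing. */
#include <stdio.h>

enum xb_state { INITIALISING, INITWAIT, INITIALISED, CONNECTED, CLOSING, CLOSED };

static enum xb_state backend_state = INITWAIT;
static int ring_mapped;

static void on_frontend_changed(enum xb_state frontend_state)
{
        switch (frontend_state) {
        case INITIALISED:
        case CONNECTED:
                if (backend_state == CONNECTED)
                        break;          /* duplicate watch event: no-op */
                ring_mapped = 1;        /* stands in for connect_ring() */
                backend_state = CONNECTED;
                printf("backend connected\n");
                break;
        case CLOSING:
                backend_state = CLOSING;
                break;
        default:
                break;
        }
}

int main(void)
{
        on_frontend_changed(INITIALISED);  /* connects */
        on_frontend_changed(CONNECTED);    /* duplicate: ignored */
        return 0;
}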
43474 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkfront/blkfront.c tmp-linux-2.6-xen.patch/drivers/xen/blkfront/blkfront.c
43475 --- ref-linux-2.6.16.9/drivers/xen/blkfront/blkfront.c  1970-01-01 01:00:00.000000000 +0100
43476 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkfront/blkfront.c     2006-04-10 00:05:52.000000000 +0200
43477 @@ -0,0 +1,819 @@
43478 +/******************************************************************************
43479 + * blkfront.c
43480 + * 
43481 + * XenLinux virtual block-device driver.
43482 + * 
43483 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
43484 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
43485 + * Copyright (c) 2004, Christian Limpach
43486 + * Copyright (c) 2004, Andrew Warfield
43487 + * Copyright (c) 2005, Christopher Clark
43488 + * Copyright (c) 2005, XenSource Ltd
43489 + * 
43490 + * This program is free software; you can redistribute it and/or
43491 + * modify it under the terms of the GNU General Public License version 2
43492 + * as published by the Free Software Foundation; or, when distributed
43493 + * separately from the Linux kernel or incorporated into other
43494 + * software packages, subject to the following license:
43495 + * 
43496 + * Permission is hereby granted, free of charge, to any person obtaining a copy
43497 + * of this source file (the "Software"), to deal in the Software without
43498 + * restriction, including without limitation the rights to use, copy, modify,
43499 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
43500 + * and to permit persons to whom the Software is furnished to do so, subject to
43501 + * the following conditions:
43502 + * 
43503 + * The above copyright notice and this permission notice shall be included in
43504 + * all copies or substantial portions of the Software.
43505 + * 
43506 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43507 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43508 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43509 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
43510 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
43511 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
43512 + * IN THE SOFTWARE.
43513 + */
43514 +
43515 +#include <linux/version.h>
43516 +#include "block.h"
43517 +#include <linux/cdrom.h>
43518 +#include <linux/sched.h>
43519 +#include <linux/interrupt.h>
43520 +#include <scsi/scsi.h>
43521 +#include <xen/evtchn.h>
43522 +#include <xen/xenbus.h>
43523 +#include <xen/interface/grant_table.h>
43524 +#include <xen/gnttab.h>
43525 +#include <asm/hypervisor.h>
43526 +
43527 +#define BLKIF_STATE_DISCONNECTED 0
43528 +#define BLKIF_STATE_CONNECTED    1
43529 +#define BLKIF_STATE_SUSPENDED    2
43530 +
43531 +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
43532 +    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
43533 +#define GRANT_INVALID_REF      0
43534 +
43535 +static void connect(struct blkfront_info *);
43536 +static void blkfront_closing(struct xenbus_device *);
43537 +static int blkfront_remove(struct xenbus_device *);
43538 +static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
43539 +static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
43540 +
43541 +static void kick_pending_request_queues(struct blkfront_info *);
43542 +
43543 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
43544 +static void blkif_restart_queue(void *arg);
43545 +static void blkif_recover(struct blkfront_info *);
43546 +static void blkif_completion(struct blk_shadow *);
43547 +static void blkif_free(struct blkfront_info *, int);
43548 +
43549 +
43550 +/**
43551 + * Entry point to this code when a new device is created.  Allocate the basic
43552 + * structures and the ring buffer for communication with the backend, and
43553 + * inform the backend of the appropriate details for those.  Switch to
43554 + * Initialised state.
43555 + */
43556 +static int blkfront_probe(struct xenbus_device *dev,
43557 +                         const struct xenbus_device_id *id)
43558 +{
43559 +       int err, vdevice, i;
43560 +       struct blkfront_info *info;
43561 +
43562 +       /* FIXME: Use dynamic device id if this is not set. */
43563 +       err = xenbus_scanf(XBT_NULL, dev->nodename,
43564 +                          "virtual-device", "%i", &vdevice);
43565 +       if (err != 1) {
43566 +               xenbus_dev_fatal(dev, err, "reading virtual-device");
43567 +               return err;
43568 +       }
43569 +
43570 +       info = kzalloc(sizeof(*info), GFP_KERNEL);
43571 +       if (!info) {
43572 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
43573 +               return -ENOMEM;
43574 +       }
43575 +
43576 +       info->xbdev = dev;
43577 +       info->vdevice = vdevice;
43578 +       info->connected = BLKIF_STATE_DISCONNECTED;
43579 +       INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
43580 +
43581 +       for (i = 0; i < BLK_RING_SIZE; i++)
43582 +               info->shadow[i].req.id = i+1;
43583 +       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
43584 +
43585 +       /* Front end dir is a number, which is used as the id. */
43586 +       info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
43587 +       dev->data = info;
43588 +
43589 +       err = talk_to_backend(dev, info);
43590 +       if (err) {
43591 +               kfree(info);
43592 +               dev->data = NULL;
43593 +               return err;
43594 +       }
43595 +
43596 +       return 0;
43597 +}
43598 +
43599 +
43600 +/**
43601 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
43602 + * driver restart.  We tear down our blkif structure and recreate it, but
43603 + * leave the device-layer structures intact so that this is transparent to the
43604 + * rest of the kernel.
43605 + */
43606 +static int blkfront_resume(struct xenbus_device *dev)
43607 +{
43608 +       struct blkfront_info *info = dev->data;
43609 +       int err;
43610 +
43611 +       DPRINTK("blkfront_resume: %s\n", dev->nodename);
43612 +
43613 +       blkif_free(info, 1);
43614 +
43615 +       err = talk_to_backend(dev, info);
43616 +       if (!err)
43617 +               blkif_recover(info);
43618 +
43619 +       return err;
43620 +}
43621 +
43622 +
43623 +/* Common code used when first setting up, and when resuming. */
43624 +static int talk_to_backend(struct xenbus_device *dev,
43625 +                          struct blkfront_info *info)
43626 +{
43627 +       const char *message = NULL;
43628 +       xenbus_transaction_t xbt;
43629 +       int err;
43630 +
43631 +       /* Create shared ring, alloc event channel. */
43632 +       err = setup_blkring(dev, info);
43633 +       if (err)
43634 +               goto out;
43635 +
43636 +again:
43637 +       err = xenbus_transaction_start(&xbt);
43638 +       if (err) {
43639 +               xenbus_dev_fatal(dev, err, "starting transaction");
43640 +               goto destroy_blkring;
43641 +       }
43642 +
43643 +       err = xenbus_printf(xbt, dev->nodename,
43644 +                           "ring-ref","%u", info->ring_ref);
43645 +       if (err) {
43646 +               message = "writing ring-ref";
43647 +               goto abort_transaction;
43648 +       }
43649 +       err = xenbus_printf(xbt, dev->nodename,
43650 +                           "event-channel", "%u", info->evtchn);
43651 +       if (err) {
43652 +               message = "writing event-channel";
43653 +               goto abort_transaction;
43654 +       }
43655 +
43656 +       err = xenbus_transaction_end(xbt, 0);
43657 +       if (err) {
43658 +               if (err == -EAGAIN)
43659 +                       goto again;
43660 +               xenbus_dev_fatal(dev, err, "completing transaction");
43661 +               goto destroy_blkring;
43662 +       }
43663 +
43664 +       xenbus_switch_state(dev, XenbusStateInitialised);
43665 +
43666 +       return 0;
43667 +
43668 + abort_transaction:
43669 +       xenbus_transaction_end(xbt, 1);
43670 +       if (message)
43671 +               xenbus_dev_fatal(dev, err, "%s", message);
43672 + destroy_blkring:
43673 +       blkif_free(info, 0);
43674 + out:
43675 +       return err;
43676 +}
43677 +
43678 +
43679 +static int setup_blkring(struct xenbus_device *dev,
43680 +                        struct blkfront_info *info)
43681 +{
43682 +       blkif_sring_t *sring;
43683 +       int err;
43684 +
43685 +       info->ring_ref = GRANT_INVALID_REF;
43686 +
43687 +       sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
43688 +       if (!sring) {
43689 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
43690 +               return -ENOMEM;
43691 +       }
43692 +       SHARED_RING_INIT(sring);
43693 +       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
43694 +
43695 +       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
43696 +       if (err < 0) {
43697 +               free_page((unsigned long)sring);
43698 +               info->ring.sring = NULL;
43699 +               goto fail;
43700 +       }
43701 +       info->ring_ref = err;
43702 +
43703 +       err = xenbus_alloc_evtchn(dev, &info->evtchn);
43704 +       if (err)
43705 +               goto fail;
43706 +
43707 +       err = bind_evtchn_to_irqhandler(
43708 +               info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
43709 +       if (err <= 0) {
43710 +               xenbus_dev_fatal(dev, err,
43711 +                                "bind_evtchn_to_irqhandler failed");
43712 +               goto fail;
43713 +       }
43714 +       info->irq = err;
43715 +
43716 +       return 0;
43717 +fail:
43718 +       blkif_free(info, 0);
43719 +       return err;
43720 +}
43721 +
43722 +
43723 +/**
43724 + * Callback received when the backend's state changes.
43725 + */
43726 +static void backend_changed(struct xenbus_device *dev,
43727 +                           XenbusState backend_state)
43728 +{
43729 +       struct blkfront_info *info = dev->data;
43730 +       struct block_device *bd;
43731 +
43732 +       DPRINTK("blkfront:backend_changed.\n");
43733 +
43734 +       switch (backend_state) {
43735 +       case XenbusStateUnknown:
43736 +       case XenbusStateInitialising:
43737 +       case XenbusStateInitWait:
43738 +       case XenbusStateInitialised:
43739 +       case XenbusStateClosed:
43740 +               break;
43741 +
43742 +       case XenbusStateConnected:
43743 +               connect(info);
43744 +               break;
43745 +
43746 +       case XenbusStateClosing:
43747 +               bd = bdget(info->dev);
43748 +               if (bd == NULL)
43749 +                       xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
43750 +
43751 +               down(&bd->bd_sem);
43752 +               if (info->users > 0)
43753 +                       xenbus_dev_error(dev, -EBUSY,
43754 +                                        "Device in use; refusing to close");
43755 +               else
43756 +                       blkfront_closing(dev);
43757 +               up(&bd->bd_sem);
43758 +               bdput(bd);
43759 +               break;
43760 +       }
43761 +}
43762 +
43763 +
43764 +/* ** Connection ** */
43765 +
43766 +
43767 +/*
43768 + * Invoked when the backend is finally 'ready' (and has produced the
43769 + * details about the physical device - #sectors, size, etc).
43770 + */
43771 +static void connect(struct blkfront_info *info)
43772 +{
43773 +       unsigned long sectors, sector_size;
43774 +       unsigned int binfo;
43775 +       int err;
43776 +
43777 +       if ((info->connected == BLKIF_STATE_CONNECTED) ||
43778 +           (info->connected == BLKIF_STATE_SUSPENDED) )
43779 +               return;
43780 +
43781 +       DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
43782 +
43783 +       err = xenbus_gather(XBT_NULL, info->xbdev->otherend,
43784 +                           "sectors", "%lu", &sectors,
43785 +                           "info", "%u", &binfo,
43786 +                           "sector-size", "%lu", &sector_size,
43787 +                           NULL);
43788 +       if (err) {
43789 +               xenbus_dev_fatal(info->xbdev, err,
43790 +                                "reading backend fields at %s",
43791 +                                info->xbdev->otherend);
43792 +               return;
43793 +       }
43794 +
43795 +       err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
43796 +       if (err) {
43797 +               xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
43798 +                                info->xbdev->otherend);
43799 +               return;
43800 +       }
43801 +
43802 +       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
43803 +
43804 +       /* Kick pending requests. */
43805 +       spin_lock_irq(&blkif_io_lock);
43806 +       info->connected = BLKIF_STATE_CONNECTED;
43807 +       kick_pending_request_queues(info);
43808 +       spin_unlock_irq(&blkif_io_lock);
43809 +
43810 +       add_disk(info->gd);
43811 +}
43812 +
43813 +/**
43814 + * Handle the change of state of the backend to Closing.  We must delete our
43815 + * device-layer structures now, to ensure that writes are flushed through to
43816 + * the backend.  Once this is done, we can switch to Closed in
43817 + * acknowledgement.
43818 + */
43819 +static void blkfront_closing(struct xenbus_device *dev)
43820 +{
43821 +       struct blkfront_info *info = dev->data;
43822 +
43823 +       DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
43824 +
43825 +       xlvbd_del(info);
43826 +
43827 +       xenbus_switch_state(dev, XenbusStateClosed);
43828 +}
43829 +
43830 +
43831 +static int blkfront_remove(struct xenbus_device *dev)
43832 +{
43833 +       struct blkfront_info *info = dev->data;
43834 +
43835 +       DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
43836 +
43837 +       blkif_free(info, 0);
43838 +
43839 +       kfree(info);
43840 +
43841 +       return 0;
43842 +}
43843 +
43844 +
43845 +static inline int GET_ID_FROM_FREELIST(
43846 +       struct blkfront_info *info)
43847 +{
43848 +       unsigned long free = info->shadow_free;
43849 +       BUG_ON(free > BLK_RING_SIZE);
43850 +       info->shadow_free = info->shadow[free].req.id;
43851 +       info->shadow[free].req.id = 0x0fffffee; /* debug */
43852 +       return free;
43853 +}
43854 +
43855 +static inline void ADD_ID_TO_FREELIST(
43856 +       struct blkfront_info *info, unsigned long id)
43857 +{
43858 +       info->shadow[id].req.id  = info->shadow_free;
43859 +       info->shadow[id].request = 0;
43860 +       info->shadow_free = id;
43861 +}
43862 +
43863 +static inline void flush_requests(struct blkfront_info *info)
43864 +{
43865 +       int notify;
43866 +
43867 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
43868 +
43869 +       if (notify)
43870 +               notify_remote_via_irq(info->irq);
43871 +}
43872 +
43873 +static void kick_pending_request_queues(struct blkfront_info *info)
43874 +{
43875 +       if (!RING_FULL(&info->ring)) {
43876 +               /* Re-enable calldowns. */
43877 +               blk_start_queue(info->rq);
43878 +               /* Kick things off immediately. */
43879 +               do_blkif_request(info->rq);
43880 +       }
43881 +}
43882 +
43883 +static void blkif_restart_queue(void *arg)
43884 +{
43885 +       struct blkfront_info *info = (struct blkfront_info *)arg;
43886 +       spin_lock_irq(&blkif_io_lock);
43887 +       kick_pending_request_queues(info);
43888 +       spin_unlock_irq(&blkif_io_lock);
43889 +}
43890 +
43891 +static void blkif_restart_queue_callback(void *arg)
43892 +{
43893 +       struct blkfront_info *info = (struct blkfront_info *)arg;
43894 +       schedule_work(&info->work);
43895 +}
43896 +
43897 +int blkif_open(struct inode *inode, struct file *filep)
43898 +{
43899 +       struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
43900 +       info->users++;
43901 +       return 0;
43902 +}
43903 +
43904 +
43905 +int blkif_release(struct inode *inode, struct file *filep)
43906 +{
43907 +       struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
43908 +       info->users--;
43909 +       if (info->users == 0) {
43910 +               /* Check whether we have been instructed to close.  We will
43911 +                  have ignored this request initially, as the device was
43912 +                  still mounted. */
43913 +               struct xenbus_device * dev = info->xbdev;
43914 +               XenbusState state = xenbus_read_driver_state(dev->otherend);
43915 +
43916 +               if (state == XenbusStateClosing)
43917 +                       blkfront_closing(dev);
43918 +       }
43919 +       return 0;
43920 +}
43921 +
43922 +
43923 +int blkif_ioctl(struct inode *inode, struct file *filep,
43924 +                unsigned command, unsigned long argument)
43925 +{
43926 +       int i;
43927 +
43928 +       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
43929 +                     command, (long)argument, inode->i_rdev);
43930 +
43931 +       switch (command) {
43932 +       case HDIO_GETGEO:
43933 +               /* return ENOSYS to use defaults */
43934 +               return -ENOSYS;
43935 +
43936 +       case CDROMMULTISESSION:
43937 +               DPRINTK("FIXME: support multisession CDs later\n");
43938 +               for (i = 0; i < sizeof(struct cdrom_multisession); i++)
43939 +                       if (put_user(0, (char __user *)(argument + i)))
43940 +                               return -EFAULT;
43941 +               return 0;
43942 +
43943 +       default:
43944 +               /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
43945 +                 command);*/
43946 +               return -EINVAL; /* same return as native Linux */
43947 +       }
43948 +
43949 +       return 0;
43950 +}
43951 +
43952 +
43953 +/*
43954 + * blkif_queue_request
43955 + *
43956 + * request block io
43957 + *
43958 + * id: for guest use only.
43959 + * operation: BLKIF_OP_{READ,WRITE,PROBE}
43960 + * buffer: buffer to read/write into.  This should be a
43961 + *   virtual address in the guest OS.
43962 + */
43963 +static int blkif_queue_request(struct request *req)
43964 +{
43965 +       struct blkfront_info *info = req->rq_disk->private_data;
43966 +       unsigned long buffer_mfn;
43967 +       blkif_request_t *ring_req;
43968 +       struct bio *bio;
43969 +       struct bio_vec *bvec;
43970 +       int idx;
43971 +       unsigned long id;
43972 +       unsigned int fsect, lsect;
43973 +       int ref;
43974 +       grant_ref_t gref_head;
43975 +
43976 +       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
43977 +               return 1;
43978 +
43979 +       if (gnttab_alloc_grant_references(
43980 +               BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
43981 +               gnttab_request_free_callback(
43982 +                       &info->callback,
43983 +                       blkif_restart_queue_callback,
43984 +                       info,
43985 +                       BLKIF_MAX_SEGMENTS_PER_REQUEST);
43986 +               return 1;
43987 +       }
43988 +
43989 +       /* Fill out a communications ring structure. */
43990 +       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
43991 +       id = GET_ID_FROM_FREELIST(info);
43992 +       info->shadow[id].request = (unsigned long)req;
43993 +
43994 +       ring_req->id = id;
43995 +       ring_req->operation = rq_data_dir(req) ?
43996 +               BLKIF_OP_WRITE : BLKIF_OP_READ;
43997 +       ring_req->sector_number = (blkif_sector_t)req->sector;
43998 +       ring_req->handle = info->handle;
43999 +
44000 +       ring_req->nr_segments = 0;
44001 +       rq_for_each_bio (bio, req) {
44002 +               bio_for_each_segment (bvec, bio, idx) {
44003 +                       BUG_ON(ring_req->nr_segments
44004 +                              == BLKIF_MAX_SEGMENTS_PER_REQUEST);
44005 +                       buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
44006 +                       fsect = bvec->bv_offset >> 9;
44007 +                       lsect = fsect + (bvec->bv_len >> 9) - 1;
44008 +                       /* install a grant reference. */
44009 +                       ref = gnttab_claim_grant_reference(&gref_head);
44010 +                       BUG_ON(ref == -ENOSPC);
44011 +
44012 +                       gnttab_grant_foreign_access_ref(
44013 +                               ref,
44014 +                               info->xbdev->otherend_id,
44015 +                               buffer_mfn,
44016 +                               rq_data_dir(req) );
44017 +
44018 +                       info->shadow[id].frame[ring_req->nr_segments] =
44019 +                               mfn_to_pfn(buffer_mfn);
44020 +
44021 +                       ring_req->seg[ring_req->nr_segments] =
44022 +                               (struct blkif_request_segment) {
44023 +                                       .gref       = ref,
44024 +                                       .first_sect = fsect,
44025 +                                       .last_sect  = lsect };
44026 +
44027 +                       ring_req->nr_segments++;
44028 +               }
44029 +       }
44030 +
44031 +       info->ring.req_prod_pvt++;
44032 +
44033 +       /* Keep a private copy so we can reissue requests when recovering. */
44034 +       info->shadow[id].req = *ring_req;
44035 +
44036 +       gnttab_free_grant_references(gref_head);
44037 +
44038 +       return 0;
44039 +}
44040 +
44041 +/*
44042 + * do_blkif_request
44043 + *  read a block; request is in a request queue
44044 + */
44045 +void do_blkif_request(request_queue_t *rq)
44046 +{
44047 +       struct blkfront_info *info = NULL;
44048 +       struct request *req;
44049 +       int queued;
44050 +
44051 +       DPRINTK("Entered do_blkif_request\n");
44052 +
44053 +       queued = 0;
44054 +
44055 +       while ((req = elv_next_request(rq)) != NULL) {
44056 +               info = req->rq_disk->private_data;
44057 +               if (!blk_fs_request(req)) {
44058 +                       end_request(req, 0);
44059 +                       continue;
44060 +               }
44061 +
44062 +               if (RING_FULL(&info->ring))
44063 +                       goto wait;
44064 +
44065 +               DPRINTK("do_blk_req %p: cmd %p, sec %lx, "
44066 +                       "(%u/%li) buffer:%p [%s]\n",
44067 +                       req, req->cmd, req->sector, req->current_nr_sectors,
44068 +                       req->nr_sectors, req->buffer,
44069 +                       rq_data_dir(req) ? "write" : "read");
44070 +
44071 +
44072 +               blkdev_dequeue_request(req);
44073 +               if (blkif_queue_request(req)) {
44074 +                       blk_requeue_request(rq, req);
44075 +               wait:
44076 +                       /* Avoid pointless unplugs. */
44077 +                       blk_stop_queue(rq);
44078 +                       break;
44079 +               }
44080 +
44081 +               queued++;
44082 +       }
44083 +
44084 +       if (queued != 0)
44085 +               flush_requests(info);
44086 +}
44087 +
44088 +
44089 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
44090 +{
44091 +       struct request *req;
44092 +       blkif_response_t *bret;
44093 +       RING_IDX i, rp;
44094 +       unsigned long flags;
44095 +       struct blkfront_info *info = (struct blkfront_info *)dev_id;
44096 +
44097 +       spin_lock_irqsave(&blkif_io_lock, flags);
44098 +
44099 +       if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
44100 +               spin_unlock_irqrestore(&blkif_io_lock, flags);
44101 +               return IRQ_HANDLED;
44102 +       }
44103 +
44104 + again:
44105 +       rp = info->ring.sring->rsp_prod;
44106 +       rmb(); /* Ensure we see queued responses up to 'rp'. */
44107 +
44108 +       for (i = info->ring.rsp_cons; i != rp; i++) {
44109 +               unsigned long id;
44110 +               int ret;
44111 +
44112 +               bret = RING_GET_RESPONSE(&info->ring, i);
44113 +               id   = bret->id;
44114 +               req  = (struct request *)info->shadow[id].request;
44115 +
44116 +               blkif_completion(&info->shadow[id]);
44117 +
44118 +               ADD_ID_TO_FREELIST(info, id);
44119 +
44120 +               switch (bret->operation) {
44121 +               case BLKIF_OP_READ:
44122 +               case BLKIF_OP_WRITE:
44123 +                       if (unlikely(bret->status != BLKIF_RSP_OKAY))
44124 +                               DPRINTK("Bad return from blkdev data "
44125 +                                       "request: %x\n", bret->status);
44126 +
44127 +                       ret = end_that_request_first(
44128 +                               req, (bret->status == BLKIF_RSP_OKAY),
44129 +                               req->hard_nr_sectors);
44130 +                       BUG_ON(ret);
44131 +                       end_that_request_last(
44132 +                               req, (bret->status == BLKIF_RSP_OKAY));
44133 +                       break;
44134 +               default:
44135 +                       BUG();
44136 +               }
44137 +       }
44138 +
44139 +       info->ring.rsp_cons = i;
44140 +
44141 +       if (i != info->ring.req_prod_pvt) {
44142 +               int more_to_do;
44143 +               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
44144 +               if (more_to_do)
44145 +                       goto again;
44146 +       } else
44147 +               info->ring.sring->rsp_event = i + 1;
44148 +
44149 +       kick_pending_request_queues(info);
44150 +
44151 +       spin_unlock_irqrestore(&blkif_io_lock, flags);
44152 +
44153 +       return IRQ_HANDLED;
44154 +}
44155 +
44156 +static void blkif_free(struct blkfront_info *info, int suspend)
44157 +{
44158 +       /* Prevent new requests being issued until we fix things up. */
44159 +       spin_lock_irq(&blkif_io_lock);
44160 +       info->connected = suspend ?
44161 +               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
44162 +       spin_unlock_irq(&blkif_io_lock);
44163 +
44164 +       /* Free resources associated with old device channel. */
44165 +       if (info->ring_ref != GRANT_INVALID_REF) {
44166 +               gnttab_end_foreign_access(info->ring_ref, 0,
44167 +                                         (unsigned long)info->ring.sring);
44168 +               info->ring_ref = GRANT_INVALID_REF;
44169 +               info->ring.sring = NULL;
44170 +       }
44171 +       if (info->irq)
44172 +               unbind_from_irqhandler(info->irq, info);
44173 +       info->evtchn = info->irq = 0;
44174 +
44175 +}
44176 +
44177 +static void blkif_completion(struct blk_shadow *s)
44178 +{
44179 +       int i;
44180 +       for (i = 0; i < s->req.nr_segments; i++)
44181 +               gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
44182 +}
44183 +
44184 +static void blkif_recover(struct blkfront_info *info)
44185 +{
44186 +       int i;
44187 +       blkif_request_t *req;
44188 +       struct blk_shadow *copy;
44189 +       int j;
44190 +
44191 +       /* Stage 1: Make a safe copy of the shadow state. */
44192 +       copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL);
44193 +       memcpy(copy, info->shadow, sizeof(info->shadow));
44194 +
44195 +       /* Stage 2: Set up free list. */
44196 +       memset(&info->shadow, 0, sizeof(info->shadow));
44197 +       for (i = 0; i < BLK_RING_SIZE; i++)
44198 +               info->shadow[i].req.id = i+1;
44199 +       info->shadow_free = info->ring.req_prod_pvt;
44200 +       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
44201 +
44202 +       /* Stage 3: Find pending requests and requeue them. */
44203 +       for (i = 0; i < BLK_RING_SIZE; i++) {
44204 +               /* Not in use? */
44205 +               if (copy[i].request == 0)
44206 +                       continue;
44207 +
44208 +               /* Grab a request slot and copy shadow state into it. */
44209 +               req = RING_GET_REQUEST(
44210 +                       &info->ring, info->ring.req_prod_pvt);
44211 +               *req = copy[i].req;
44212 +
44213 +               /* We get a new request id, and must reset the shadow state. */
44214 +               req->id = GET_ID_FROM_FREELIST(info);
44215 +               memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
44216 +
44217 +               /* Rewrite any grant references invalidated by susp/resume. */
44218 +               for (j = 0; j < req->nr_segments; j++)
44219 +                       gnttab_grant_foreign_access_ref(
44220 +                               req->seg[j].gref,
44221 +                               info->xbdev->otherend_id,
44222 +                               pfn_to_mfn(info->shadow[req->id].frame[j]),
44223 +                               rq_data_dir(
44224 +                                       (struct request *)
44225 +                                       info->shadow[req->id].request));
44226 +               info->shadow[req->id].req = *req;
44227 +
44228 +               info->ring.req_prod_pvt++;
44229 +       }
44230 +
44231 +       kfree(copy);
44232 +
44233 +       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
44234 +
44235 +       /* Now safe for us to use the shared ring */
44236 +       spin_lock_irq(&blkif_io_lock);
44237 +       info->connected = BLKIF_STATE_CONNECTED;
44238 +       spin_unlock_irq(&blkif_io_lock);
44239 +
44240 +       /* Send off requeued requests */
44241 +       flush_requests(info);
44242 +
44243 +       /* Kick any other new requests queued since we resumed */
44244 +       spin_lock_irq(&blkif_io_lock);
44245 +       kick_pending_request_queues(info);
44246 +       spin_unlock_irq(&blkif_io_lock);
44247 +}
44248 +
44249 +
44250 +/* ** Driver Registration ** */
44251 +
44252 +
44253 +static struct xenbus_device_id blkfront_ids[] = {
44254 +       { "vbd" },
44255 +       { "" }
44256 +};
44257 +
44258 +
44259 +static struct xenbus_driver blkfront = {
44260 +       .name = "vbd",
44261 +       .owner = THIS_MODULE,
44262 +       .ids = blkfront_ids,
44263 +       .probe = blkfront_probe,
44264 +       .remove = blkfront_remove,
44265 +       .resume = blkfront_resume,
44266 +       .otherend_changed = backend_changed,
44267 +};
44268 +
44269 +
44270 +static int __init xlblk_init(void)
44271 +{
44272 +       if (xen_init() < 0)
44273 +               return -ENODEV;
44274 +
44275 +       return xenbus_register_frontend(&blkfront);
44276 +}
44277 +module_init(xlblk_init);
44278 +
44279 +
44280 +static void xlblk_exit(void)
44281 +{
44282 +       return xenbus_unregister_driver(&blkfront);
44283 +}
44284 +module_exit(xlblk_exit);
44285 +
44286 +MODULE_LICENSE("Dual BSD/GPL");
44287 +
44288 +/*
44289 + * Local variables:
44290 + *  c-file-style: "linux"
44291 + *  indent-tabs-mode: t
44292 + *  c-indent-level: 8
44293 + *  c-basic-offset: 8
44294 + *  tab-width: 8
44295 + * End:
44296 + */
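GET_ID_FROM_FREELIST()/ADD_ID_TO_FREELIST() above thread a free list through the shadow array itself: each unused slot's req.id field holds the index of the next free slot, so allocating and releasing a shadow entry are O(1) with no extra storage. A self-contained sketch of that array-embedded free list (sizes and field names are simplified stand-ins):

/* Sketch of the array-embedded free list used by the shadow ring.
 * Callers must not allocate more than RING_SZ ids at once; the driver
 * guarantees this because the shared ring has exactly that many slots. */
#include <stdio.h>

#define RING_SZ 8

struct slot { unsigned long next_free; int in_use; };

static struct slot shadow[RING_SZ];
static unsigned long shadow_free;

static void freelist_init(void)
{
        unsigned long i;
        for (i = 0; i < RING_SZ; i++)
                shadow[i].next_free = i + 1;    /* slot i points at i+1 */
        shadow_free = 0;
}

static unsigned long get_id(void)
{
        unsigned long id = shadow_free;
        shadow_free = shadow[id].next_free;     /* pop the list head */
        shadow[id].in_use = 1;
        return id;
}

static void put_id(unsigned long id)
{
        shadow[id].next_free = shadow_free;     /* push back onto the list */
        shadow[id].in_use = 0;
        shadow_free = id;
}

int main(void)
{
        unsigned long a, b;

        freelist_init();
        a = get_id();
        b = get_id();
        printf("got %lu and %lu\n", a, b);      /* 0 and 1 */
        put_id(a);
        printf("reused %lu\n", get_id());       /* 0 again */
        return 0;
}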
44297 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkfront/block.h tmp-linux-2.6-xen.patch/drivers/xen/blkfront/block.h
44298 --- ref-linux-2.6.16.9/drivers/xen/blkfront/block.h     1970-01-01 01:00:00.000000000 +0100
44299 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkfront/block.h        2006-04-10 00:05:52.000000000 +0200
44300 @@ -0,0 +1,165 @@
44301 +/******************************************************************************
44302 + * block.h
44303 + * 
44304 + * Shared definitions between all levels of XenLinux Virtual block devices.
44305 + * 
44306 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
44307 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
44308 + * Copyright (c) 2004-2005, Christian Limpach
44309 + * 
44310 + * This program is free software; you can redistribute it and/or
44311 + * modify it under the terms of the GNU General Public License version 2
44312 + * as published by the Free Software Foundation; or, when distributed
44313 + * separately from the Linux kernel or incorporated into other
44314 + * software packages, subject to the following license:
44315 + * 
44316 + * Permission is hereby granted, free of charge, to any person obtaining a copy
44317 + * of this source file (the "Software"), to deal in the Software without
44318 + * restriction, including without limitation the rights to use, copy, modify,
44319 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
44320 + * and to permit persons to whom the Software is furnished to do so, subject to
44321 + * the following conditions:
44322 + * 
44323 + * The above copyright notice and this permission notice shall be included in
44324 + * all copies or substantial portions of the Software.
44325 + * 
44326 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
44327 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44328 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44329 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44330 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
44331 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
44332 + * IN THE SOFTWARE.
44333 + */
44334 +
44335 +#ifndef __XEN_DRIVERS_BLOCK_H__
44336 +#define __XEN_DRIVERS_BLOCK_H__
44337 +
44338 +#include <linux/config.h>
44339 +#include <linux/version.h>
44340 +#include <linux/module.h>
44341 +#include <linux/kernel.h>
44342 +#include <linux/sched.h>
44343 +#include <linux/slab.h>
44344 +#include <linux/string.h>
44345 +#include <linux/errno.h>
44346 +#include <linux/fs.h>
44347 +#include <linux/hdreg.h>
44348 +#include <linux/blkdev.h>
44349 +#include <linux/major.h>
44350 +#include <linux/devfs_fs_kernel.h>
44351 +#include <asm/hypervisor.h>
44352 +#include <xen/xenbus.h>
44353 +#include <xen/gnttab.h>
44354 +#include <xen/interface/xen.h>
44355 +#include <xen/interface/io/blkif.h>
44356 +#include <xen/interface/io/ring.h>
44357 +#include <asm/io.h>
44358 +#include <asm/atomic.h>
44359 +#include <asm/uaccess.h>
44360 +
44361 +#if 1
44362 +#define IPRINTK(fmt, args...) \
44363 +    printk(KERN_INFO "xen_blk: " fmt, ##args)
44364 +#else
44365 +#define IPRINTK(fmt, args...) ((void)0)
44366 +#endif
44367 +
44368 +#if 1
44369 +#define WPRINTK(fmt, args...) \
44370 +    printk(KERN_WARNING "xen_blk: " fmt, ##args)
44371 +#else
44372 +#define WPRINTK(fmt, args...) ((void)0)
44373 +#endif
44374 +
44375 +#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
44376 +
44377 +#if 0
44378 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
44379 +#else
44380 +#define DPRINTK_IOCTL(_f, _a...) ((void)0)
44381 +#endif
44382 +
44383 +struct xlbd_type_info
44384 +{
44385 +       int partn_shift;
44386 +       int disks_per_major;
44387 +       char *devname;
44388 +       char *diskname;
44389 +};
44390 +
44391 +struct xlbd_major_info
44392 +{
44393 +       int major;
44394 +       int index;
44395 +       int usage;
44396 +       struct xlbd_type_info *type;
44397 +};
44398 +
44399 +struct blk_shadow {
44400 +       blkif_request_t req;
44401 +       unsigned long request;
44402 +       unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
44403 +};
44404 +
44405 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
44406 +
44407 +/*
44408 + * We have one of these per vbd, whether ide, scsi or 'other'.  They
44409 + * hang in private_data off the gendisk structure. We may end up
44410 + * putting all kinds of interesting stuff here :-)
44411 + */
44412 +struct blkfront_info
44413 +{
44414 +       struct xenbus_device *xbdev;
44415 +       dev_t dev;
44416 +       struct gendisk *gd;
44417 +       int vdevice;
44418 +       blkif_vdev_t handle;
44419 +       int connected;
44420 +       int ring_ref;
44421 +       blkif_front_ring_t ring;
44422 +       unsigned int evtchn, irq;
44423 +       struct xlbd_major_info *mi;
44424 +       request_queue_t *rq;
44425 +       struct work_struct work;
44426 +       struct gnttab_free_callback callback;
44427 +       struct blk_shadow shadow[BLK_RING_SIZE];
44428 +       unsigned long shadow_free;
44429 +
44430 +       /**
44431 +        * The number of people holding this device open.  We won't allow a
44432 +        * hot-unplug unless this is 0.
44433 +        */
44434 +       int users;
44435 +};
44436 +
44437 +extern spinlock_t blkif_io_lock;
44438 +
44439 +extern int blkif_open(struct inode *inode, struct file *filep);
44440 +extern int blkif_release(struct inode *inode, struct file *filep);
44441 +extern int blkif_ioctl(struct inode *inode, struct file *filep,
44442 +                       unsigned command, unsigned long argument);
44443 +extern int blkif_check(dev_t dev);
44444 +extern int blkif_revalidate(dev_t dev);
44445 +extern void do_blkif_request (request_queue_t *rq);
44446 +
44447 +/* Virtual block-device subsystem. */
44448 +/* Note that xlvbd_add doesn't call add_disk for you: you're expected
44449 +   to call add_disk on info->gd once the disk is properly connected
44450 +   up. */
44451 +int xlvbd_add(blkif_sector_t capacity, int device,
44452 +             u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
44453 +void xlvbd_del(struct blkfront_info *info);
44454 +
44455 +#endif /* __XEN_DRIVERS_BLOCK_H__ */
44456 +
44457 +/*
44458 + * Local variables:
44459 + *  c-file-style: "linux"
44460 + *  indent-tabs-mode: t
44461 + *  c-indent-level: 8
44462 + *  c-basic-offset: 8
44463 + *  tab-width: 8
44464 + * End:
44465 + */
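BLK_RING_SIZE above comes from __RING_SIZE(), which counts how many request/response slots fit in one shared page after the ring's producer/consumer header and rounds down to a power of two, so the ring indices can wrap by simple masking. A simplified version of that calculation with illustrative stand-in sizes (the real figures depend on the layout of blkif_sring_t):

/* Simplified sketch of the __RING_SIZE() idea behind BLK_RING_SIZE.
 * HEADER_SZ and ENTRY_SZ are illustrative stand-ins, not the real
 * blkif structure sizes. */
#include <stdio.h>

#define PAGE_SZ   4096
#define HEADER_SZ 64    /* stand-in for the sring bookkeeping fields */
#define ENTRY_SZ  112   /* stand-in for a request/response union slot */

static unsigned int ring_size(void)
{
        unsigned int n = (PAGE_SZ - HEADER_SZ) / ENTRY_SZ;
        unsigned int pow2 = 1;

        while (pow2 * 2 <= n)   /* round down to a power of two so  */
                pow2 *= 2;      /* producer/consumer indices can    */
        return pow2;            /* wrap by masking                  */
}

int main(void)
{
        printf("%u slots per page\n", ring_size());  /* 32 with these sizes */
        return 0;
}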
44466 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkfront/Makefile tmp-linux-2.6-xen.patch/drivers/xen/blkfront/Makefile
44467 --- ref-linux-2.6.16.9/drivers/xen/blkfront/Makefile    1970-01-01 01:00:00.000000000 +0100
44468 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkfront/Makefile       2006-04-10 00:05:52.000000000 +0200
44469 @@ -0,0 +1,5 @@
44470 +
44471 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      := xenblk.o
44472 +
44473 +xenblk-objs := blkfront.o vbd.o
44474 +
44475 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blkfront/vbd.c tmp-linux-2.6-xen.patch/drivers/xen/blkfront/vbd.c
44476 --- ref-linux-2.6.16.9/drivers/xen/blkfront/vbd.c       1970-01-01 01:00:00.000000000 +0100
44477 +++ tmp-linux-2.6-xen.patch/drivers/xen/blkfront/vbd.c  2006-04-10 00:05:52.000000000 +0200
44478 @@ -0,0 +1,327 @@
44479 +/******************************************************************************
44480 + * vbd.c
44481 + * 
44482 + * XenLinux virtual block-device driver (xvd).
44483 + * 
44484 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
44485 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
44486 + * Copyright (c) 2004-2005, Christian Limpach
44487 + * 
44488 + * This program is free software; you can redistribute it and/or
44489 + * modify it under the terms of the GNU General Public License version 2
44490 + * as published by the Free Software Foundation; or, when distributed
44491 + * separately from the Linux kernel or incorporated into other
44492 + * software packages, subject to the following license:
44493 + * 
44494 + * Permission is hereby granted, free of charge, to any person obtaining a copy
44495 + * of this source file (the "Software"), to deal in the Software without
44496 + * restriction, including without limitation the rights to use, copy, modify,
44497 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
44498 + * and to permit persons to whom the Software is furnished to do so, subject to
44499 + * the following conditions:
44500 + * 
44501 + * The above copyright notice and this permission notice shall be included in
44502 + * all copies or substantial portions of the Software.
44503 + * 
44504 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
44505 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44506 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44507 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44508 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
44509 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
44510 + * IN THE SOFTWARE.
44511 + */
44512 +
44513 +#include "block.h"
44514 +#include <linux/blkdev.h>
44515 +#include <linux/list.h>
44516 +
44517 +#define BLKIF_MAJOR(dev) ((dev)>>8)
44518 +#define BLKIF_MINOR(dev) ((dev) & 0xff)
44519 +
44520 +/*
44521 + * For convenience we distinguish between ide, scsi and 'other' (i.e.,
44522 + * potentially combinations of the two) in the naming scheme and in a few other
44523 + * places.
44524 + */
44525 +
44526 +#define NUM_IDE_MAJORS 10
44527 +#define NUM_SCSI_MAJORS 9
44528 +#define NUM_VBD_MAJORS 1
44529 +
44530 +static struct xlbd_type_info xlbd_ide_type = {
44531 +       .partn_shift = 6,
44532 +       .disks_per_major = 2,
44533 +       .devname = "ide",
44534 +       .diskname = "hd",
44535 +};
44536 +
44537 +static struct xlbd_type_info xlbd_scsi_type = {
44538 +       .partn_shift = 4,
44539 +       .disks_per_major = 16,
44540 +       .devname = "sd",
44541 +       .diskname = "sd",
44542 +};
44543 +
44544 +static struct xlbd_type_info xlbd_vbd_type = {
44545 +       .partn_shift = 4,
44546 +       .disks_per_major = 16,
44547 +       .devname = "xvd",
44548 +       .diskname = "xvd",
44549 +};
44550 +
44551 +static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
44552 +                                        NUM_VBD_MAJORS];
44553 +
44554 +#define XLBD_MAJOR_IDE_START   0
44555 +#define XLBD_MAJOR_SCSI_START  (NUM_IDE_MAJORS)
44556 +#define XLBD_MAJOR_VBD_START   (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
44557 +
44558 +#define XLBD_MAJOR_IDE_RANGE   XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
44559 +#define XLBD_MAJOR_SCSI_RANGE  XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
44560 +#define XLBD_MAJOR_VBD_RANGE   XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
44561 +
44562 +/* Information about our VBDs. */
44563 +#define MAX_VBDS 64
44564 +static LIST_HEAD(vbds_list);
44565 +
44566 +static struct block_device_operations xlvbd_block_fops =
44567 +{
44568 +       .owner = THIS_MODULE,
44569 +       .open = blkif_open,
44570 +       .release = blkif_release,
44571 +       .ioctl  = blkif_ioctl,
44572 +};
44573 +
44574 +spinlock_t blkif_io_lock = SPIN_LOCK_UNLOCKED;
44575 +
44576 +static struct xlbd_major_info *
44577 +xlbd_alloc_major_info(int major, int minor, int index)
44578 +{
44579 +       struct xlbd_major_info *ptr;
44580 +
44581 +       ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
44582 +       if (ptr == NULL)
44583 +               return NULL;
44584 +
44585 +       ptr->major = major;
44586 +
44587 +       switch (index) {
44588 +       case XLBD_MAJOR_IDE_RANGE:
44589 +               ptr->type = &xlbd_ide_type;
44590 +               ptr->index = index - XLBD_MAJOR_IDE_START;
44591 +               break;
44592 +       case XLBD_MAJOR_SCSI_RANGE:
44593 +               ptr->type = &xlbd_scsi_type;
44594 +               ptr->index = index - XLBD_MAJOR_SCSI_START;
44595 +               break;
44596 +       case XLBD_MAJOR_VBD_RANGE:
44597 +               ptr->type = &xlbd_vbd_type;
44598 +               ptr->index = index - XLBD_MAJOR_VBD_START;
44599 +               break;
44600 +       }
44601 +
44602 +       printk(KERN_INFO "Registering block device major %i\n", ptr->major);
44603 +       if (register_blkdev(ptr->major, ptr->type->devname)) {
44604 +               WPRINTK("can't get major %d with name %s\n",
44605 +                       ptr->major, ptr->type->devname);
44606 +               kfree(ptr);
44607 +               return NULL;
44608 +       }
44609 +
44610 +       devfs_mk_dir(ptr->type->devname);
44611 +       major_info[index] = ptr;
44612 +       return ptr;
44613 +}
44614 +
44615 +static struct xlbd_major_info *
44616 +xlbd_get_major_info(int vdevice)
44617 +{
44618 +       struct xlbd_major_info *mi;
44619 +       int major, minor, index;
44620 +
44621 +       major = BLKIF_MAJOR(vdevice);
44622 +       minor = BLKIF_MINOR(vdevice);
44623 +
44624 +       switch (major) {
44625 +       case IDE0_MAJOR: index = 0; break;
44626 +       case IDE1_MAJOR: index = 1; break;
44627 +       case IDE2_MAJOR: index = 2; break;
44628 +       case IDE3_MAJOR: index = 3; break;
44629 +       case IDE4_MAJOR: index = 4; break;
44630 +       case IDE5_MAJOR: index = 5; break;
44631 +       case IDE6_MAJOR: index = 6; break;
44632 +       case IDE7_MAJOR: index = 7; break;
44633 +       case IDE8_MAJOR: index = 8; break;
44634 +       case IDE9_MAJOR: index = 9; break;
44635 +       case SCSI_DISK0_MAJOR: index = 10; break;
44636 +       case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
44637 +               index = 11 + major - SCSI_DISK1_MAJOR;
44638 +               break;
44639 +       case SCSI_CDROM_MAJOR: index = 18; break;
44640 +       default: index = 19; break;
44641 +       }
44642 +
44643 +       mi = ((major_info[index] != NULL) ? major_info[index] :
44644 +             xlbd_alloc_major_info(major, minor, index));
44645 +       if (mi)
44646 +               mi->usage++;
44647 +       return mi;
44648 +}
44649 +
44650 +static void
44651 +xlbd_put_major_info(struct xlbd_major_info *mi)
44652 +{
44653 +       mi->usage--;
44654 +       /* XXX: release major if 0 */
44655 +}
44656 +
44657 +static int
44658 +xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
44659 +{
44660 +       request_queue_t *rq;
44661 +
44662 +       rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
44663 +       if (rq == NULL)
44664 +               return -1;
44665 +
44666 +       elevator_init(rq, "noop");
44667 +
44668 +       /* Hard sector size and max sectors impersonate the equiv. hardware. */
44669 +       blk_queue_hardsect_size(rq, sector_size);
44670 +       blk_queue_max_sectors(rq, 512);
44671 +
44672 +       /* Each segment in a request is up to an aligned page in size. */
44673 +       blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
44674 +       blk_queue_max_segment_size(rq, PAGE_SIZE);
44675 +
44676 +       /* Ensure a merged request will fit in a single I/O ring slot. */
44677 +       blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
44678 +       blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
44679 +
44680 +       /* Make sure buffer addresses are sector-aligned. */
44681 +       blk_queue_dma_alignment(rq, 511);
44682 +
44683 +       gd->queue = rq;
44684 +
44685 +       return 0;
44686 +}
44687 +
44688 +static int
44689 +xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice,
44690 +                   u16 vdisk_info, u16 sector_size,
44691 +                   struct blkfront_info *info)
44692 +{
44693 +       struct gendisk *gd;
44694 +       struct xlbd_major_info *mi;
44695 +       int nr_minors = 1;
44696 +       int err = -ENODEV;
44697 +
44698 +       BUG_ON(info->gd != NULL);
44699 +       BUG_ON(info->mi != NULL);
44700 +       BUG_ON(info->rq != NULL);
44701 +
44702 +       mi = xlbd_get_major_info(vdevice);
44703 +       if (mi == NULL)
44704 +               goto out;
44705 +       info->mi = mi;
44706 +
44707 +       if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
44708 +               nr_minors = 1 << mi->type->partn_shift;
44709 +
44710 +       gd = alloc_disk(nr_minors);
44711 +       if (gd == NULL)
44712 +               goto out;
44713 +
44714 +       if (nr_minors > 1)
44715 +               sprintf(gd->disk_name, "%s%c", mi->type->diskname,
44716 +                       'a' + mi->index * mi->type->disks_per_major +
44717 +                       (minor >> mi->type->partn_shift));
44718 +       else
44719 +               sprintf(gd->disk_name, "%s%c%d", mi->type->diskname,
44720 +                       'a' + mi->index * mi->type->disks_per_major +
44721 +                       (minor >> mi->type->partn_shift),
44722 +                       minor & ((1 << mi->type->partn_shift) - 1));
44723 +
44724 +       gd->major = mi->major;
44725 +       gd->first_minor = minor;
44726 +       gd->fops = &xlvbd_block_fops;
44727 +       gd->private_data = info;
44728 +       gd->driverfs_dev = &(info->xbdev->dev);
44729 +       set_capacity(gd, capacity);
44730 +
44731 +       if (xlvbd_init_blk_queue(gd, sector_size)) {
44732 +               put_disk(gd);   /* never added via add_disk(), so just drop it */
44733 +               goto out;
44734 +       }
44735 +
44736 +       info->rq = gd->queue;
44737 +
44738 +       if (vdisk_info & VDISK_READONLY)
44739 +               set_disk_ro(gd, 1);
44740 +
44741 +       if (vdisk_info & VDISK_REMOVABLE)
44742 +               gd->flags |= GENHD_FL_REMOVABLE;
44743 +
44744 +       if (vdisk_info & VDISK_CDROM)
44745 +               gd->flags |= GENHD_FL_CD;
44746 +
44747 +       info->gd = gd;
44748 +
44749 +       return 0;
44750 +
44751 + out:
44752 +       if (mi)
44753 +               xlbd_put_major_info(mi);
44754 +       info->mi = NULL;
44755 +       return err;
44756 +}
44757 +
44758 +int
44759 +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
44760 +         u16 sector_size, struct blkfront_info *info)
44761 +{
44762 +       struct block_device *bd;
44763 +       int err = 0;
44764 +
44765 +       info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice));
44766 +
44767 +       bd = bdget(info->dev);
44768 +       if (bd == NULL)
44769 +               return -ENODEV;
44770 +
44771 +       err = xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice,
44772 +                                 vdisk_info, sector_size, info);
44773 +
44774 +       bdput(bd);
44775 +       return err;
44776 +}
44777 +
44778 +void
44779 +xlvbd_del(struct blkfront_info *info)
44780 +{
44781 +       if (info->mi == NULL)
44782 +               return;
44783 +
44784 +       BUG_ON(info->gd == NULL);
44785 +       del_gendisk(info->gd);
44786 +       put_disk(info->gd);
44787 +       info->gd = NULL;
44788 +
44789 +       xlbd_put_major_info(info->mi);
44790 +       info->mi = NULL;
44791 +
44792 +       BUG_ON(info->rq == NULL);
44793 +       blk_cleanup_queue(info->rq);
44794 +       info->rq = NULL;
44795 +}
44796 +
44797 +/*
44798 + * Local variables:
44799 + *  c-file-style: "linux"
44800 + *  indent-tabs-mode: t
44801 + *  c-indent-level: 8
44802 + *  c-basic-offset: 8
44803 + *  tab-width: 8
44804 + * End:
44805 + */
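The naming arithmetic above is compact; as a stand-alone illustration, the
sketch below reproduces it outside the kernel. xlbd_disk_name() and struct
type_info are hypothetical stand-ins for the xlbd_type_info fields; the
shift/offset math mirrors xlvbd_alloc_gendisk():

    #include <stdio.h>

    /* Mirrors the naming-related fields of struct xlbd_type_info. */
    struct type_info {
            int partn_shift, disks_per_major;
            const char *diskname;
    };

    /* Build a disk name the way xlvbd_alloc_gendisk() does: letter from the
     * disk number, optional trailing digit from the partition number. */
    static void xlbd_disk_name(char *buf, const struct type_info *t,
                               int type_index, int minor)
    {
            int disk = type_index * t->disks_per_major + (minor >> t->partn_shift);
            int part = minor & ((1 << t->partn_shift) - 1);

            if (part == 0)          /* disk-aligned minor: whole disk */
                    sprintf(buf, "%s%c", t->diskname, 'a' + disk);
            else
                    sprintf(buf, "%s%c%d", t->diskname, 'a' + disk, part);
    }

    int main(void)
    {
            struct type_info ide = { 6, 2, "hd" };  /* xlbd_ide_type above */
            char buf[32];

            xlbd_disk_name(buf, &ide, 0, 65);
            printf("%s\n", buf);    /* minor 65 on the first IDE major: "hdb1" */
            return 0;
    }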
44806 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blktap/blktap.c tmp-linux-2.6-xen.patch/drivers/xen/blktap/blktap.c
44807 --- ref-linux-2.6.16.9/drivers/xen/blktap/blktap.c      1970-01-01 01:00:00.000000000 +0100
44808 +++ tmp-linux-2.6-xen.patch/drivers/xen/blktap/blktap.c 2006-04-10 00:05:52.000000000 +0200
44809 @@ -0,0 +1,910 @@
44810 +/******************************************************************************
44811 + * arch/xen/drivers/blkif/blktap/blktap.c
44812 + * 
44813 + * This is a modified version of the block backend driver that remaps requests
44814 + * to a user-space memory region.  It is intended to be used to write 
44815 + * application-level servers that provide block interfaces to client VMs.
44816 + */
44817 +
44818 +#include <linux/kernel.h>
44819 +#include <linux/spinlock.h>
44820 +#include <xen/balloon.h>
44822 +#include <linux/fs.h>
44823 +#include <linux/mm.h>
44824 +#include <linux/miscdevice.h>
44825 +#include <linux/errno.h>
44826 +#include <linux/major.h>
44827 +#include <linux/gfp.h>
44828 +#include <linux/poll.h>
44829 +#include <asm/tlbflush.h>
44830 +#include "common.h"
44831 +
44832 +/* Only one process may open /dev/xen/blktap at any time. */
44833 +static unsigned long blktap_dev_inuse;
44834 +unsigned long blktap_ring_ok; /* make this ring->state */
44835 +
44836 +/* Rings up to user space. */
44837 +static blkif_front_ring_t blktap_ufe_ring;
44838 +
44839 +/* for poll: */
44840 +static wait_queue_head_t blktap_wait;
44841 +
44842 +/* current switching mode */
44843 +static unsigned long blktap_mode;
44844 +
44845 +/* local prototypes */
44846 +static int blktap_read_ufe_ring(void);
44847 +
44848 +
44849 +/* /dev/xen/blktap resides at device number major=10, minor=202        */ 
44850 +#define BLKTAP_MINOR 202
44851 +
44852 +/* blktap IOCTLs:                                                      */
44853 +#define BLKTAP_IOCTL_KICK_FE         1
44854 +#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
44855 +#define BLKTAP_IOCTL_SETMODE         3
44856 +#define BLKTAP_IOCTL_PRINT_IDXS      100  
44857 +
44858 +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
44859 +#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
44860 +#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
44861 +#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp. */
44862 +#define BLKTAP_MODE_COPY_FE          0x00000004  /* unimp. */
44863 +#define BLKTAP_MODE_COPY_BE          0x00000008  /* unimp. */
44864 +#define BLKTAP_MODE_COPY_FE_PAGES    0x00000010  /* unimp. */
44865 +#define BLKTAP_MODE_COPY_BE_PAGES    0x00000020  /* unimp. */
44866 +
44867 +#define BLKTAP_MODE_INTERPOSE \
44868 +           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
44869 +
44870 +#define BLKTAP_MODE_COPY_BOTH \
44871 +           (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
44872 +
44873 +#define BLKTAP_MODE_COPY_BOTH_PAGES \
44874 +           (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
44875 +
44876 +static inline int BLKTAP_MODE_VALID(unsigned long arg)
44877 +{
44878 +       return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
44879 +               (arg == BLKTAP_MODE_INTERCEPT_FE) ||
44880 +               (arg == BLKTAP_MODE_INTERPOSE   ));
44881 +/*
44882 +  return (
44883 +  ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
44884 +  ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
44885 +  ( arg == BLKTAP_MODE_INTERCEPT_BE ) ||
44886 +  ( arg == BLKTAP_MODE_INTERPOSE    ) ||
44887 +  ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) ||
44888 +  ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
44889 +  ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
44890 +  );
44891 +*/
44892 +}
44893 +
44894 +
44895 +/******************************************************************
44896 + * MMAP REGION
44897 + */
44898 +
44899 +/*
44900 + * We use a big chunk of address space to map in-flight requests into,
44901 + * and export this region up to user-space.  See the comments in blkback
44902 + * about this -- the two must be kept in sync if the tap is used as a 
44903 + * passthrough.
44904 + */
44905 +
44906 +#define MAX_PENDING_REQS 64
44907 +#define BATCH_PER_DOMAIN 16
44908 +
44909 +/* The first RING_PAGES pages of the mmap area are reserved for the
44910 + * shared-memory rings; the user data mappings start immediately after.
44911 + */
44912 +#define RING_PAGES 1 /* Front */ 
44913 +
44914 +/* Where things are inside the device mapping. */
44915 +struct vm_area_struct *blktap_vma = NULL;
44916 +unsigned long mmap_vstart;  /* Kernel pages for mapping in data. */
44917 +unsigned long rings_vstart; /* start of mmaped vma               */
44918 +unsigned long user_vstart;  /* start of user mappings            */
44919 +
44920 +#define MMAP_PAGES                                             \
44921 +       (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
44922 +#define MMAP_VADDR(_start, _req,_seg)                                  \
44923 +       (_start +                                                       \
44924 +        ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
44925 +        ((_seg) * PAGE_SIZE))
44926 +
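+/*
+ * Worked example: MMAP_VADDR() strides one aligned page per segment.
+ * Assuming BLKIF_MAX_SEGMENTS_PER_REQUEST is 11 (its usual value) and
+ * 4 KiB pages, segment 2 of request 3 lands at
+ *
+ *     MMAP_VADDR(user_vstart, 3, 2) == user_vstart + (3*11 + 2)*PAGE_SIZE
+ *                                   == user_vstart + 0x23000
+ */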
44927 +/*
44928 + * Each outstanding request that we've passed to the lower device layers has a 
44929 + * 'pending_req' allocated to it. Each I/O fragment that completes decrements 
44930 + * the pendcnt towards zero. When it hits zero, the specified domain has a 
44931 + * response queued for it, with the saved 'id' passed back.
44932 + */
44933 +typedef struct {
44934 +       blkif_t       *blkif;
44935 +       unsigned long  id;
44936 +       int            nr_pages;
44937 +       atomic_t       pendcnt;
44938 +       unsigned short operation;
44939 +       int            status;
44940 +} pending_req_t;
44941 +
44942 +/*
44943 + * We can't allocate pending_req's in order, since they may complete out of 
44944 + * order. We therefore maintain an allocation ring. This ring also indicates 
44945 + * when enough work has been passed down -- at that point the allocation ring 
44946 + * will be empty.
44947 + */
44948 +static pending_req_t pending_reqs[MAX_PENDING_REQS];
44949 +static unsigned char pending_ring[MAX_PENDING_REQS];
44950 +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
44951 +/* NB. We use a different index type to differentiate from shared blk rings. */
44952 +typedef unsigned int PEND_RING_IDX;
44953 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
44954 +static PEND_RING_IDX pending_prod, pending_cons;
44955 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
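+/*
+ * Lifecycle example: blkif_init() below starts with pending_cons == 0 and
+ * pending_prod == MAX_PENDING_REQS, so NR_PENDING_REQS == 0 (the ring is
+ * full of free slots).  dispatch_rw_block_io() consumes a slot via
+ * pending_ring[MASK_PEND_IDX(pending_cons++)]; blktap_read_ufe_ring()
+ * recycles it via pending_ring[MASK_PEND_IDX(pending_prod++)] = idx.
+ */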
44956 +
44957 +/* Requests passing through the tap to the backend hijack the id field
44958 + * in the request message.  In it we put the AR index _AND_ the fe domid.
44959 + * the domid is used by the backend to map the pages properly.
44960 + */
44961 +
44962 +static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
44963 +{
44964 +       return ((fe_dom << 16) | MASK_PEND_IDX(idx));
44965 +}
44966 +
44967 +static inline PEND_RING_IDX ID_TO_IDX(unsigned long id) 
44968 +{ 
44969 +       return (PEND_RING_IDX)(id & 0x0000ffff);
44970 +}
44971 +
44972 +static inline domid_t ID_TO_DOM(unsigned long id) 
44973 +{ 
44974 +       return (domid_t)(id >> 16); 
44975 +}
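+/*
+ * Worked example (domain 5, ring slot 7):
+ *
+ *     id = MAKE_ID(5, 7)  == 0x00050007
+ *     ID_TO_DOM(id) == 5, ID_TO_IDX(id) == 7  (only the MASK_PEND_IDX()ed
+ *     slot number survives the round trip)
+ */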
44976 +
44977 +
44978 +
44979 +/******************************************************************
44980 + * GRANT HANDLES
44981 + */
44982 +
44983 +/* When using grant tables to map a frame for device access then the
44984 + * handle returned must be used to unmap the frame. This is needed to
44985 + * drop the ref count on the frame.
44986 + */
44987 +struct grant_handle_pair
44988 +{
44989 +       grant_handle_t kernel;
44990 +       grant_handle_t user;
44991 +};
44992 +static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
44993 +#define pending_handle(_idx, _i) \
44994 +    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
44995 +#define BLKTAP_INVALID_HANDLE(_g) \
44996 +    ((((_g)->kernel) == 0xFFFF) && (((_g)->user) == 0xFFFF))
44997 +#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
44998 +    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
44999 +    } while(0)
45000 +
45001 +
45002 +/******************************************************************
45003 + * BLKTAP VM OPS
45004 + */
45005 +
45006 +static struct page *blktap_nopage(struct vm_area_struct *vma,
45007 +                                 unsigned long address,
45008 +                                 int *type)
45009 +{
45010 +       /*
45011 +        * if the page has not been mapped in by the driver then generate
45012 +        * a SIGBUS to the domain.
45013 +        */
45014 +       force_sig(SIGBUS, current);
45015 +
45016 +       return NOPAGE_SIGBUS;
45017 +}
45018 +
45019 +struct vm_operations_struct blktap_vm_ops = {
45020 +       .nopage = blktap_nopage,
45021 +};
45022 +
45023 +/******************************************************************
45024 + * BLKTAP FILE OPS
45025 + */
45026 +
45027 +static int blktap_open(struct inode *inode, struct file *filp)
45028 +{
45029 +       blkif_sring_t *sring;
45030 +
45031 +       if (test_and_set_bit(0, &blktap_dev_inuse))
45032 +               return -EBUSY;
45033 +    
45034 +       /* Allocate the fe ring. */
45035 +       sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
45036 +       if (sring == NULL)
45037 +               return -ENOMEM;
45038 +
45039 +       SetPageReserved(virt_to_page(sring));
45040 +    
45041 +       SHARED_RING_INIT(sring);
45042 +       FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);
45043 +
45044 +       return 0;
45045 +}
45046 +
45047 +static int blktap_release(struct inode *inode, struct file *filp)
45048 +{
45049 +       blktap_dev_inuse = 0;
45050 +       blktap_ring_ok = 0;
45051 +
45052 +       /* Free the ring page. */
45053 +       ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
45054 +       free_page((unsigned long) blktap_ufe_ring.sring);
45055 +
45056 +       /* Clear any active mappings and free foreign map table */
45057 +       if (blktap_vma != NULL) {
45058 +               zap_page_range(
45059 +                       blktap_vma, blktap_vma->vm_start, 
45060 +                       blktap_vma->vm_end - blktap_vma->vm_start, NULL);
45061 +               blktap_vma = NULL;
45062 +       }
45063 +
45064 +       return 0;
45065 +}
45066 +
45067 +
45068 +/* Note on mmap:
45069 + * We need to map pages to user space in a way that will allow the block
45070 + * subsystem set up direct IO to them.  This couldn't be done before, because
45071 + * there isn't really a sane way to translate a user virtual address down to a 
45072 + * physical address when the page belongs to another domain.
45073 + *
45074 + * My first approach was to map the page into kernel memory, add an entry
45075 + * for it in the physical frame list (using alloc_lomem_region as in blkback)
45076 + * and then attempt to map that page up to user space.  This is disallowed
45077 + * by xen though, which realizes that we don't really own the machine frame
45078 + * underlying the physical page.
45079 + *
45080 + * The new approach is to provide explicit support for this in xen linux.
45081 + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
45082 + * mapped from other VMs.  vma->vm_private_data is set up as a mapping 
45083 + * from pages to actual page structs.  There is a new clause in get_user_pages
45084 + * that does the right thing for this sort of mapping.
45085 + */
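+/*
+ * Sketch of the lookup this enables (the real clause lives in
+ * get_user_pages(); names as in this file): with VM_FOREIGN set, a user
+ * address in the VMA resolves through the private map rather than the
+ * page tables:
+ *
+ *     struct page **map = vma->vm_private_data;
+ *     struct page *pg  = map[(addr - vma->vm_start) >> PAGE_SHIFT];
+ *
+ * dispatch_rw_block_io() below fills these slots as grants are mapped and
+ * blktap_read_ufe_ring() clears them on completion.
+ */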
45086 +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
45087 +{
45088 +       int size;
45089 +       struct page **map;
45090 +       int i;
45091 +
45092 +       DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
45093 +               vma->vm_start, vma->vm_end);
45094 +
45095 +       vma->vm_flags |= VM_RESERVED;
45096 +       vma->vm_ops = &blktap_vm_ops;
45097 +
45098 +       size = vma->vm_end - vma->vm_start;
45099 +       if (size != ((MMAP_PAGES + RING_PAGES) << PAGE_SHIFT)) {
45100 +               printk(KERN_INFO 
45101 +                      "blktap: you _must_ map exactly %d pages!\n",
45102 +                      MMAP_PAGES + RING_PAGES);
45103 +               return -EAGAIN;
45104 +       }
45105 +
45106 +       size >>= PAGE_SHIFT;
45107 +       DPRINTK(KERN_INFO "blktap: %d ring + %d data pages.\n", RING_PAGES, size - RING_PAGES);
45108 +    
45109 +       rings_vstart = vma->vm_start;
45110 +       user_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
45111 +    
45112 +       /* Map the ring pages to the start of the region and reserve it. */
45113 +
45114 +       /* not sure if I really need to do this... */
45115 +       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
45116 +
45117 +       if (remap_pfn_range(vma, vma->vm_start, 
45118 +                           __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, 
45119 +                           PAGE_SIZE, vma->vm_page_prot)) {
45120 +               WPRINTK("Mapping user ring failed!\n");
45121 +               goto fail;
45122 +       }
45123 +
45124 +       /* Mark this VM as containing foreign pages, and set up mappings. */
45125 +       map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
45126 +                     * sizeof(struct page *),
45127 +                     GFP_KERNEL);
45128 +       if (map == NULL) {
45129 +               WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
45130 +               goto fail;
45131 +       }
45132 +
45133 +       for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
45134 +               map[i] = NULL;
45135 +    
45136 +       vma->vm_private_data = map;
45137 +       vma->vm_flags |= VM_FOREIGN;
45138 +
45139 +       blktap_vma = vma;
45140 +       blktap_ring_ok = 1;
45141 +
45142 +       return 0;
45143 + fail:
45144 +       /* Clear any active mappings. */
45145 +       zap_page_range(vma, vma->vm_start, 
45146 +                      vma->vm_end - vma->vm_start, NULL);
45147 +
45148 +       return -ENOMEM;
45149 +}
45150 +
45151 +static int blktap_ioctl(struct inode *inode, struct file *filp,
45152 +                        unsigned int cmd, unsigned long arg)
45153 +{
45154 +       switch(cmd) {
45155 +       case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
45156 +               return blktap_read_ufe_ring();
45157 +
45158 +       case BLKTAP_IOCTL_SETMODE:
45159 +               if (!BLKTAP_MODE_VALID(arg))
45160 +                       return -EINVAL;
45161 +               blktap_mode = arg;
45162 +               /* XXX: may need to flush rings here. */
45163 +               printk(KERN_INFO "blktap: set mode to %lx\n", arg);
45164 +               return 0;
45165 +       case BLKTAP_IOCTL_PRINT_IDXS:
45166 +        {
45167 +               //print_fe_ring_idxs();
45168 +               WPRINTK("User Rings: \n-----------\n");
45169 +               WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d "
45170 +                       "| req_prod: %2d, rsp_prod: %2d\n",
45171 +                       blktap_ufe_ring.rsp_cons,
45172 +                       blktap_ufe_ring.req_prod_pvt,
45173 +                       blktap_ufe_ring.sring->req_prod,
45174 +                       blktap_ufe_ring.sring->rsp_prod);
45175 +            
45176 +        }
45177 +       }
45178 +       return -ENOIOCTLCMD;
45179 +}
45180 +
45181 +static unsigned int blktap_poll(struct file *file, poll_table *wait)
45182 +{
45183 +       poll_wait(file, &blktap_wait, wait);
45184 +       if (blktap_ufe_ring.req_prod_pvt != blktap_ufe_ring.sring->req_prod) {
45185 +               flush_tlb_all();
45186 +               RING_PUSH_REQUESTS(&blktap_ufe_ring);
45187 +               return POLLIN | POLLRDNORM;
45188 +       }
45189 +
45190 +       return 0;
45191 +}
45192 +
45193 +void blktap_kick_user(void)
45194 +{
45195 +       /* blktap_ring->req_prod = blktap_req_prod; */
45196 +       wake_up_interruptible(&blktap_wait);
45197 +}
45198 +
45199 +static struct file_operations blktap_fops = {
45200 +       .owner   = THIS_MODULE,
45201 +       .poll    = blktap_poll,
45202 +       .ioctl   = blktap_ioctl,
45203 +       .open    = blktap_open,
45204 +       .release = blktap_release,
45205 +       .mmap    = blktap_mmap,
45206 +};
45207 +
45208 +
45209 +
45210 +static int do_block_io_op(blkif_t *blkif, int max_to_do);
45211 +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
45212 +static void make_response(blkif_t *blkif, unsigned long id, 
45213 +                          unsigned short op, int st);
45214 +
45215 +
45216 +static void fast_flush_area(int idx, int nr_pages)
45217 +{
45218 +       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
45219 +       unsigned int i, op = 0;
45220 +       struct grant_handle_pair *handle;
45221 +       uint64_t ptep;
45222 +       int ret;
45223 +
45224 +       for (i = 0; i < nr_pages; i++) {
45226 +               handle = &pending_handle(idx, i);
45227 +               if (BLKTAP_INVALID_HANDLE(handle))
45228 +                       continue;
45229 +
45230 +               unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
45231 +               unmap[op].dev_bus_addr = 0;
45232 +               unmap[op].handle = handle->kernel;
45233 +               op++;
45234 +
45235 +               if (create_lookup_pte_addr(
45236 +                           blktap_vma->vm_mm,
45237 +                           MMAP_VADDR(user_vstart, idx, i), 
45238 +                           &ptep) != 0) {
45239 +                       DPRINTK("Couldn't get a pte addr!\n");
45240 +                       return;
45241 +               }
45242 +               unmap[op].host_addr    = ptep;
45243 +               unmap[op].dev_bus_addr = 0;
45244 +               unmap[op].handle       = handle->user;
45245 +               op++;
45246 +            
45247 +               BLKTAP_INVALIDATE_HANDLE(handle);
45248 +       }
45249 +
45250 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, op);
45251 +       BUG_ON(ret);
45252 +
45253 +       if (blktap_vma != NULL)
45254 +               zap_page_range(blktap_vma, 
45255 +                              MMAP_VADDR(user_vstart, idx, 0), 
45256 +                              nr_pages << PAGE_SHIFT, NULL);
45257 +}
45258 +
45259 +/******************************************************************
45260 + * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
45261 + */
45262 +
45263 +static struct list_head blkio_schedule_list;
45264 +static spinlock_t blkio_schedule_list_lock;
45265 +
45266 +static int __on_blkdev_list(blkif_t *blkif)
45267 +{
45268 +       return blkif->blkdev_list.next != NULL;
45269 +}
45270 +
45271 +static void remove_from_blkdev_list(blkif_t *blkif)
45272 +{
45273 +       unsigned long flags;
45274 +
45275 +       if (!__on_blkdev_list(blkif))
45276 +               return;
45277 +
45278 +       spin_lock_irqsave(&blkio_schedule_list_lock, flags);
45279 +       if (__on_blkdev_list(blkif)) {
45280 +               list_del(&blkif->blkdev_list);
45281 +               blkif->blkdev_list.next = NULL;
45282 +               blkif_put(blkif);
45283 +       }
45284 +       spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
45285 +}
45286 +
45287 +static void add_to_blkdev_list_tail(blkif_t *blkif)
45288 +{
45289 +       unsigned long flags;
45290 +
45291 +       if (__on_blkdev_list(blkif))
45292 +               return;
45293 +
45294 +       spin_lock_irqsave(&blkio_schedule_list_lock, flags);
45295 +       if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
45296 +               list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
45297 +               blkif_get(blkif);
45298 +       }
45299 +       spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
45300 +}
45301 +
45302 +
45303 +/******************************************************************
45304 + * SCHEDULER FUNCTIONS
45305 + */
45306 +
45307 +static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
45308 +
45309 +static int blkio_schedule(void *arg)
45310 +{
45311 +       DECLARE_WAITQUEUE(wq, current);
45312 +
45313 +       blkif_t          *blkif;
45314 +       struct list_head *ent;
45315 +
45316 +       daemonize("xenblkd");
45317 +
45318 +       for (;;) {
45319 +               /* Wait for work to do. */
45320 +               add_wait_queue(&blkio_schedule_wait, &wq);
45321 +               set_current_state(TASK_INTERRUPTIBLE);
45322 +               if ((NR_PENDING_REQS == MAX_PENDING_REQS) || 
45323 +                   list_empty(&blkio_schedule_list))
45324 +                       schedule();
45325 +               __set_current_state(TASK_RUNNING);
45326 +               remove_wait_queue(&blkio_schedule_wait, &wq);
45327 +
45328 +               /* Queue up a batch of requests. */
45329 +               while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
45330 +                      !list_empty(&blkio_schedule_list)) {
45331 +                       ent = blkio_schedule_list.next;
45332 +                       blkif = list_entry(ent, blkif_t, blkdev_list);
45333 +                       blkif_get(blkif);
45334 +                       remove_from_blkdev_list(blkif);
45335 +                       if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
45336 +                               add_to_blkdev_list_tail(blkif);
45337 +                       blkif_put(blkif);
45338 +               }
45339 +       }
45340 +}
45341 +
45342 +static void maybe_trigger_blkio_schedule(void)
45343 +{
45344 +       /*
45345 +        * Needed so that two processes, which together make the following
45346 +        * predicate true, don't both read stale values and evaluate the
45347 +        * predicate incorrectly. Incredibly unlikely to stall the scheduler
45348 +        * on the x86, but...
45349 +        */
45350 +       smp_mb();
45351 +
45352 +       if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
45353 +           !list_empty(&blkio_schedule_list))
45354 +               wake_up(&blkio_schedule_wait);
45355 +}
45356 +
45357 +
45358 +
45359 +/******************************************************************
45360 + * COMPLETION CALLBACK -- Called as bh->b_end_io()
45361 + */
45362 +
45363 +
45364 +static int blktap_read_ufe_ring(void)
45365 +{
45366 +       /* This is called to read responses from the UFE ring. */
45367 +
45368 +       RING_IDX i, j, rp;
45369 +       blkif_response_t *resp;
45370 +       blkif_t *blkif;
45371 +       int pending_idx;
45372 +       pending_req_t *pending_req;
45373 +       unsigned long     flags;
45374 +
45375 +       /* If we are forwarding from the UFE ring to the FE ring. */
45376 +       if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
45377 +
45378 +               /* For each outstanding message on the UFE ring. */
45379 +               rp = blktap_ufe_ring.sring->rsp_prod;
45380 +               rmb();
45381 +        
45382 +               for (i = blktap_ufe_ring.rsp_cons; i != rp; i++) {
45383 +                       resp = RING_GET_RESPONSE(&blktap_ufe_ring, i);
45384 +                       pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id));
45385 +                       pending_req = &pending_reqs[pending_idx];
45386 +            
45387 +                       blkif = pending_req->blkif;
45388 +                       for (j = 0; j < pending_req->nr_pages; j++) {
45389 +                               unsigned long vaddr;
45390 +                               struct page **map = blktap_vma->vm_private_data;
45391 +                               int offset; 
45392 +
45393 +                               vaddr  = MMAP_VADDR(user_vstart, pending_idx, j);
45394 +                               offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
45395 +
45396 +                               //ClearPageReserved(virt_to_page(vaddr));
45397 +                               ClearPageReserved(map[offset]);
45398 +                               map[offset] = NULL;
45399 +                       }
45400 +
45401 +                       fast_flush_area(pending_idx, pending_req->nr_pages);
45402 +                       make_response(blkif, pending_req->id, resp->operation, 
45403 +                                     resp->status);
45404 +                       blkif_put(pending_req->blkif);
45405 +                       spin_lock_irqsave(&pend_prod_lock, flags);
45406 +                       pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
45407 +                       spin_unlock_irqrestore(&pend_prod_lock, flags);
45408 +               }
45409 +               blktap_ufe_ring.rsp_cons = i;
45410 +               maybe_trigger_blkio_schedule();
45411 +       }
45412 +       return 0;
45413 +}
45414 +
45415 +
45416 +/******************************************************************************
45417 + * NOTIFICATION FROM GUEST OS.
45418 + */
45419 +
45420 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
45421 +{
45422 +       blkif_t *blkif = dev_id;
45423 +       add_to_blkdev_list_tail(blkif);
45424 +       maybe_trigger_blkio_schedule();
45425 +       return IRQ_HANDLED;
45426 +}
45427 +
45428 +
45429 +
45430 +/******************************************************************
45431 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
45432 + */
45433 +
45434 +static int do_block_io_op(blkif_t *blkif, int max_to_do)
45435 +{
45436 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
45437 +       blkif_request_t *req;
45438 +       RING_IDX i, rp;
45439 +       int more_to_do = 0;
45440 +    
45441 +       rp = blk_ring->sring->req_prod;
45442 +       rmb(); /* Ensure we see queued requests up to 'rp'. */
45443 +
45444 +       for (i = blk_ring->req_cons; 
45445 +            (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
45446 +            i++) {
45447 +               if ((max_to_do-- == 0) ||
45448 +                   (NR_PENDING_REQS == MAX_PENDING_REQS)) {
45449 +                       more_to_do = 1;
45450 +                       break;
45451 +               }
45452 +        
45453 +               req = RING_GET_REQUEST(blk_ring, i);
45454 +               switch (req->operation) {
45455 +               case BLKIF_OP_READ:
45456 +               case BLKIF_OP_WRITE:
45457 +                       dispatch_rw_block_io(blkif, req);
45458 +                       break;
45459 +
45460 +               default:
45461 +                       DPRINTK("error: unknown block io operation [%d]\n",
45462 +                               req->operation);
45463 +                       make_response(blkif, req->id, req->operation,
45464 +                                     BLKIF_RSP_ERROR);
45465 +                       break;
45466 +               }
45467 +       }
45468 +
45469 +       blk_ring->req_cons = i;
45470 +       blktap_kick_user();
45471 +
45472 +       return more_to_do;
45473 +}
45474 +
45475 +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
45476 +{
45477 +       blkif_request_t *target;
45478 +       int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
45479 +       pending_req_t *pending_req;
45480 +       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
45481 +       int op, ret;
45482 +       unsigned int nseg;
45483 +       int retval;
45484 +
45485 +       /* Check that number of segments is sane. */
45486 +       nseg = req->nr_segments;
45487 +       if (unlikely(nseg == 0) || 
45488 +           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
45489 +               DPRINTK("Bad number of segments in request (%d)\n", nseg);
45490 +               goto bad_descriptor;
45491 +       }
45492 +
45493 +       /* Make sure userspace is ready. */
45494 +       if (!blktap_ring_ok) {
45495 +               DPRINTK("blktap: ring not ready for requests!\n");
45496 +               goto bad_descriptor;
45497 +       }
45498 +    
45499 +
45500 +       if (RING_FULL(&blktap_ufe_ring)) {
45501 +               WPRINTK("blktap: fe_ring is full, can't add "
45502 +                       "(very broken!).\n");
45503 +               goto bad_descriptor;
45504 +       }
45505 +
45506 +       flush_cache_all(); /* a noop on intel... */
45507 +
45508 +       /* Map the foreign pages directly into the application. */
45509 +       op = 0;
45510 +       for (i = 0; i < req->nr_segments; i++) {
45511 +
45512 +               unsigned long uvaddr;
45513 +               unsigned long kvaddr;
45514 +               uint64_t ptep;
45515 +
45516 +               uvaddr = MMAP_VADDR(user_vstart, pending_idx, i);
45517 +               kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
45518 +
45519 +               /* Map the remote page to kernel. */
45520 +               map[op].host_addr = kvaddr;
45521 +               map[op].dom   = blkif->domid;
45522 +               map[op].ref   = req->seg[i].gref;
45523 +               map[op].flags = GNTMAP_host_map;
45524 +               /* This needs a bit more thought in terms of interposition: 
45525 +                * If we want to be able to modify pages during write using 
45526 +                * grant table mappings, the guest will either need to allow 
45527 +                * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */
45528 +               if (req->operation == BLKIF_OP_WRITE)
45529 +                       map[op].flags |= GNTMAP_readonly;
45530 +               op++;
45531 +
45532 +               /* Now map it to user. */
45533 +               ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
45534 +               if (ret) {
45535 +                       DPRINTK("Couldn't get a pte addr!\n");
45536 +                       fast_flush_area(pending_idx, req->nr_segments);
45537 +                       goto bad_descriptor;
45538 +               }
45539 +
45540 +               map[op].host_addr = ptep;
45541 +               map[op].dom       = blkif->domid;
45542 +               map[op].ref       = req->seg[i].gref;
45543 +               map[op].flags     = GNTMAP_host_map | GNTMAP_application_map
45544 +                       | GNTMAP_contains_pte;
45545 +               /* Above interposition comment applies here as well. */
45546 +               if (req->operation == BLKIF_OP_WRITE)
45547 +                       map[op].flags |= GNTMAP_readonly;
45548 +               op++;
45549 +       }
45550 +
45551 +       retval = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
45552 +       BUG_ON(retval);
45553 +
45554 +       op = 0;
45555 +       for (i = 0; i < (req->nr_segments*2); i += 2) {
45556 +               unsigned long uvaddr;
45557 +               unsigned long kvaddr;
45558 +               unsigned long offset;
45559 +               int cancel = 0;
45560 +
45561 +               uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2);
45562 +               kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2);
45563 +
45564 +               if (unlikely(map[i].status)) {
45565 +                       DPRINTK("Error on kernel grant mapping (%d)\n",
45566 +                               map[i].status);
45567 +                       ret = map[i].status;
45568 +                       cancel = 1;
45569 +               }
45570 +
45571 +               if (unlikely(map[i+1].status)) {
45572 +                       DPRINTK("Error on user grant mapping (%d)\n",
45573 +                               map[i+1].status);
45574 +                       ret = map[i+1].status;
45575 +                       cancel = 1;
45576 +               }
45577 +
45578 +               if (cancel) {
45579 +                       fast_flush_area(pending_idx, req->nr_segments);
45580 +                       goto bad_descriptor;
45581 +               }
45582 +
45583 +               /* Set the necessary mappings in p2m and in the VM_FOREIGN 
45584 +                * vm_area_struct to allow user vaddr -> struct page lookups
45585 +                * to work.  This is needed for direct IO to foreign pages. */
45586 +               set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
45587 +                               FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
45588 +
45589 +               offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
45590 +               ((struct page **)blktap_vma->vm_private_data)[offset] =
45591 +                       pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
45592 +
45593 +               /* Save handles for unmapping later. */
45594 +               pending_handle(pending_idx, i/2).kernel = map[i].handle;
45595 +               pending_handle(pending_idx, i/2).user   = map[i+1].handle;
45596 +       }
45597 +
45598 +       /* Mark mapped pages as reserved: */
45599 +       for (i = 0; i < req->nr_segments; i++) {
45600 +               unsigned long kvaddr;
45601 +               kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
45602 +               SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
45603 +       }
45604 +
45605 +       pending_req = &pending_reqs[pending_idx];
45606 +       pending_req->blkif     = blkif;
45607 +       pending_req->id        = req->id;
45608 +       pending_req->operation = req->operation;
45609 +       pending_req->status    = BLKIF_RSP_OKAY;
45610 +       pending_req->nr_pages  = nseg;
45611 +       req->id = MAKE_ID(blkif->domid, pending_idx);
45612 +       //atomic_set(&pending_req->pendcnt, nbio);
45613 +       pending_cons++;
45614 +       blkif_get(blkif);
45615 +
45616 +       /* Finally, write the request message to the user ring. */
45617 +       target = RING_GET_REQUEST(&blktap_ufe_ring,
45618 +                                 blktap_ufe_ring.req_prod_pvt);
45619 +       memcpy(target, req, sizeof(*req));
45620 +       blktap_ufe_ring.req_prod_pvt++;
45621 +       return;
45622 +
45623 + bad_descriptor:
45624 +       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
45625 +} 
45626 +
45627 +
45628 +
45629 +/******************************************************************
45630 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
45631 + */
45632 +
45633 +
45634 +static void make_response(blkif_t *blkif, unsigned long id, 
45635 +                          unsigned short op, int st)
45636 +{
45637 +       blkif_response_t *resp;
45638 +       unsigned long     flags;
45639 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
45640 +
45641 +       /* Place on the response ring for the relevant domain. */ 
45642 +       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
45643 +       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
45644 +       resp->id        = id;
45645 +       resp->operation = op;
45646 +       resp->status    = st;
45647 +       wmb(); /* Ensure other side can see the response fields. */
45648 +       blk_ring->rsp_prod_pvt++;
45649 +       RING_PUSH_RESPONSES(blk_ring);
45650 +       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
45651 +
45652 +       /* Kick the relevant domain. */
45653 +       notify_remote_via_irq(blkif->irq);
45654 +}
45655 +
45656 +static struct miscdevice blktap_miscdev = {
45657 +       .minor        = BLKTAP_MINOR,
45658 +       .name         = "blktap",
45659 +       .fops         = &blktap_fops,
45660 +       .devfs_name   = "misc/blktap",
45661 +};
45662 +
45663 +void blkif_deschedule(blkif_t *blkif)
45664 +{
45665 +       remove_from_blkdev_list(blkif);
45666 +}
45667 +
45668 +static int __init blkif_init(void)
45669 +{
45670 +       int i, j, err;
45671 +       struct page *page;
45672 +
45673 +       blkif_interface_init();
45674 +
45675 +       page = balloon_alloc_empty_page_range(MMAP_PAGES);
45676 +       BUG_ON(page == NULL);
45677 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
45678 +
45679 +       pending_cons = 0;
45680 +       pending_prod = MAX_PENDING_REQS;
45681 +       memset(pending_reqs, 0, sizeof(pending_reqs));
45682 +       for (i = 0; i < MAX_PENDING_REQS; i++)
45683 +               pending_ring[i] = i;
45684 +    
45685 +       spin_lock_init(&blkio_schedule_list_lock);
45686 +       INIT_LIST_HEAD(&blkio_schedule_list);
45687 +
45688 +       i = kernel_thread(blkio_schedule, NULL, CLONE_FS | CLONE_FILES);
45689 +       BUG_ON(i < 0);
45690 +
45691 +       blkif_xenbus_init();
45692 +
45693 +       for (i = 0; i < MAX_PENDING_REQS; i++)
45694 +               for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
45695 +                       BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));
45696 +
45697 +       err = misc_register(&blktap_miscdev);
45698 +       if (err != 0) {
45699 +               printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n",
45700 +                      err);
45701 +               return err;
45702 +       }
45703 +
45704 +       init_waitqueue_head(&blktap_wait);
45705 +
45706 +       return 0;
45707 +}
45708 +
45709 +__initcall(blkif_init);
45710 +
45711 +/*
45712 + * Local variables:
45713 + *  c-file-style: "linux"
45714 + *  indent-tabs-mode: t
45715 + *  c-indent-level: 8
45716 + *  c-basic-offset: 8
45717 + *  tab-width: 8
45718 + * End:
45719 + */
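Taken together, the pieces above imply a small user-space control loop. The
skeleton below is illustrative only and not part of the patch: the device
path follows the comment near BLKTAP_MINOR (with devfs it may instead appear
as /dev/misc/blktap), a 4 KiB page size is assumed, and MMAP_PAGES assumes
BLKIF_MAX_SEGMENTS_PER_REQUEST == 11; a real client would also pull in the
ring accessors from Xen's io/ring.h to consume requests from the mapped area.

    /* Hypothetical blktap user-space skeleton (illustrative only). */
    #include <fcntl.h>
    #include <poll.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define BLKTAP_IOCTL_KICK_FE     1           /* as defined above */
    #define BLKTAP_IOCTL_SETMODE     3
    #define BLKTAP_MODE_INTERCEPT_FE 0x00000001
    #define RING_PAGES 1
    #define MMAP_PAGES (64 * 11)   /* MAX_PENDING_REQS * max segs (assumed) */

    int main(void)
    {
            int fd = open("/dev/xen/blktap", O_RDWR);
            size_t len = (size_t)(MMAP_PAGES + RING_PAGES) * 4096;
            struct pollfd pfd = { .fd = fd, .events = POLLIN };
            void *area;

            if (fd < 0)
                    return EXIT_FAILURE;

            /* blktap_mmap() insists on exactly MMAP_PAGES + RING_PAGES pages;
             * the shared ring occupies the first page, data pages follow. */
            area = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (area == MAP_FAILED)
                    return EXIT_FAILURE;

            ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);

            for (;;) {
                    poll(&pfd, 1, -1);  /* kernel wakes us when requests queue */
                    /* ... process requests/responses in 'area' here ... */
                    ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0); /* collect responses */
            }
    }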
45720 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blktap/common.h tmp-linux-2.6-xen.patch/drivers/xen/blktap/common.h
45721 --- ref-linux-2.6.16.9/drivers/xen/blktap/common.h      1970-01-01 01:00:00.000000000 +0100
45722 +++ tmp-linux-2.6-xen.patch/drivers/xen/blktap/common.h 2006-04-10 00:05:52.000000000 +0200
45723 @@ -0,0 +1,110 @@
45724 +
45725 +#ifndef __BLKIF__BACKEND__COMMON_H__
45726 +#define __BLKIF__BACKEND__COMMON_H__
45727 +
45728 +#include <linux/config.h>
45729 +#include <linux/version.h>
45730 +#include <linux/module.h>
45731 +#include <linux/interrupt.h>
45732 +#include <linux/slab.h>
45733 +#include <linux/blkdev.h>
45734 +#include <linux/vmalloc.h>
45735 +#include <asm/io.h>
45736 +#include <asm/setup.h>
45737 +#include <asm/pgalloc.h>
45738 +#include <xen/evtchn.h>
45739 +#include <asm/hypervisor.h>
45740 +#include <xen/interface/io/blkif.h>
45741 +#include <xen/interface/io/ring.h>
45742 +#include <xen/gnttab.h>
45743 +#include <xen/driver_util.h>
45744 +
45745 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
45746 +                                    __FILE__ , __LINE__ , ## _a )
45747 +
45748 +#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
45749 +
45750 +struct vbd {
45751 +       blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
45752 +       unsigned char  readonly;    /* Non-zero -> read-only */
45753 +       unsigned char  type;        /* VDISK_xxx */
45754 +       u32            pdevice;     /* phys device that this vbd maps to */
45755 +       struct block_device *bdev;
45756 +}; 
45757 +
45758 +typedef struct blkif_st {
45759 +       /* Unique identifier for this interface. */
45760 +       domid_t           domid;
45761 +       unsigned int      handle;
45762 +       /* Physical parameters of the comms window. */
45763 +       unsigned int      evtchn;
45764 +       unsigned int      irq;
45765 +       /* Comms information. */
45766 +       blkif_back_ring_t blk_ring;
45767 +       struct vm_struct *blk_ring_area;
45768 +       /* VBDs attached to this interface. */
45769 +       struct vbd        vbd;
45770 +       /* Private fields. */
45771 +       enum { DISCONNECTED, CONNECTED } status;
45772 +#ifdef CONFIG_XEN_BLKDEV_TAP_BE
45773 +       /* Is this a blktap frontend */
45774 +       unsigned int     is_blktap;
45775 +#endif
45776 +       struct list_head blkdev_list;
45777 +       spinlock_t       blk_ring_lock;
45778 +       atomic_t         refcnt;
45779 +
45780 +       struct work_struct free_work;
45781 +
45782 +       grant_handle_t   shmem_handle;
45783 +       grant_ref_t      shmem_ref;
45784 +} blkif_t;
45785 +
45786 +blkif_t *alloc_blkif(domid_t domid);
45787 +void free_blkif_callback(blkif_t *blkif);
45788 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
45789 +
45790 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
45791 +#define blkif_put(_b)                             \
45792 +    do {                                          \
45793 +        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
45794 +            free_blkif_callback(_b);             \
45795 +    } while (0)
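+/*
+ * Reference pairing sketch: alloc_blkif() returns with refcnt == 1; e.g.
+ *
+ *     blkif_get(blkif);     refcnt 2  (queued on the schedule list)
+ *     blkif_put(blkif);     refcnt 1  (removed from the list)
+ *     blkif_put(blkif);     refcnt 0  -> free_blkif_callback() schedules
+ *                                       the actual teardown on a workqueue
+ */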
45796 +
45797 +/* Create a vbd. */
45798 +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, u32 pdevice,
45799 +              int readonly);
45800 +void vbd_free(struct vbd *vbd);
45801 +
45802 +unsigned long vbd_size(struct vbd *vbd);
45803 +unsigned int vbd_info(struct vbd *vbd);
45804 +unsigned long vbd_secsize(struct vbd *vbd);
45805 +
45806 +struct phys_req {
45807 +       unsigned short       dev;
45808 +       unsigned short       nr_sects;
45809 +       struct block_device *bdev;
45810 +       blkif_sector_t       sector_number;
45811 +};
45812 +
45813 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); 
45814 +
45815 +void blkif_interface_init(void);
45816 +
45817 +void blkif_deschedule(blkif_t *blkif);
45818 +
45819 +void blkif_xenbus_init(void);
45820 +
45821 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
45822 +
45823 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
45824 +
45825 +/*
45826 + * Local variables:
45827 + *  c-file-style: "linux"
45828 + *  indent-tabs-mode: t
45829 + *  c-indent-level: 8
45830 + *  c-basic-offset: 8
45831 + *  tab-width: 8
45832 + * End:
45833 + */
45834 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blktap/interface.c tmp-linux-2.6-xen.patch/drivers/xen/blktap/interface.c
45835 --- ref-linux-2.6.16.9/drivers/xen/blktap/interface.c   1970-01-01 01:00:00.000000000 +0100
45836 +++ tmp-linux-2.6-xen.patch/drivers/xen/blktap/interface.c      2006-04-10 00:05:52.000000000 +0200
45837 @@ -0,0 +1,146 @@
45838 +/******************************************************************************
45839 + * arch/xen/drivers/blkif/backend/interface.c
45840 + * 
45841 + * Block-device interface management.
45842 + * 
45843 + * Copyright (c) 2004, Keir Fraser
45844 + */
45845 +
45846 +#include "common.h"
45847 +#include <xen/evtchn.h>
45848 +
45849 +static kmem_cache_t *blkif_cachep;
45850 +
45851 +blkif_t *alloc_blkif(domid_t domid)
45852 +{
45853 +       blkif_t *blkif;
45854 +
45855 +       blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
45856 +       if (!blkif)
45857 +               return ERR_PTR(-ENOMEM);
45858 +
45859 +       memset(blkif, 0, sizeof(*blkif));
45860 +       blkif->domid = domid;
45861 +       blkif->status = DISCONNECTED;
45862 +       spin_lock_init(&blkif->blk_ring_lock);
45863 +       atomic_set(&blkif->refcnt, 1);
45864 +
45865 +       return blkif;
45866 +}
45867 +
45868 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
45869 +{
45870 +       struct gnttab_map_grant_ref op;
45871 +       int ret;
45872 +
45873 +       op.host_addr = (unsigned long)blkif->blk_ring_area->addr;
45874 +       op.flags     = GNTMAP_host_map;
45875 +       op.ref       = shared_page;
45876 +       op.dom       = blkif->domid;
45877 +
45878 +       lock_vm_area(blkif->blk_ring_area);
45879 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
45880 +       unlock_vm_area(blkif->blk_ring_area);
45881 +       BUG_ON(ret);
45882 +
45883 +       if (op.status) {
45884 +               DPRINTK("Grant table operation failed!\n");
45885 +               return op.status;
45886 +       }
45887 +
45888 +       blkif->shmem_ref    = shared_page;
45889 +       blkif->shmem_handle = op.handle;
45890 +
45891 +       return 0;
45892 +}
45893 +
45894 +static void unmap_frontend_page(blkif_t *blkif)
45895 +{
45896 +       struct gnttab_unmap_grant_ref op;
45897 +       int ret;
45898 +
45899 +       op.host_addr    = (unsigned long)blkif->blk_ring_area->addr;
45900 +       op.handle       = blkif->shmem_handle;
45901 +       op.dev_bus_addr = 0;
45902 +
45903 +       lock_vm_area(blkif->blk_ring_area);
45904 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
45905 +       unlock_vm_area(blkif->blk_ring_area);
45906 +       BUG_ON(ret);
45907 +}
45908 +
45909 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
45910 +{
45911 +       blkif_sring_t *sring;
45912 +       int err;
45913 +       evtchn_op_t op = {
45914 +               .cmd = EVTCHNOP_bind_interdomain,
45915 +               .u.bind_interdomain.remote_dom  = blkif->domid,
45916 +               .u.bind_interdomain.remote_port = evtchn };
45917 +
45918 +       if ((blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL)
45919 +               return -ENOMEM;
45920 +
45921 +       err = map_frontend_page(blkif, shared_page);
45922 +       if (err) {
45923 +               free_vm_area(blkif->blk_ring_area);
45924 +               return err;
45925 +       }
45926 +
45927 +       err = HYPERVISOR_event_channel_op(&op);
45928 +       if (err) {
45929 +               unmap_frontend_page(blkif);
45930 +               free_vm_area(blkif->blk_ring_area);
45931 +               return err;
45932 +       }
45933 +
45934 +       blkif->evtchn = op.u.bind_interdomain.local_port;
45935 +
45936 +       sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
45937 +       BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
45938 +
45939 +       blkif->irq = bind_evtchn_to_irqhandler(
45940 +               blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
45941 +
45942 +       blkif->status = CONNECTED;
45943 +
45944 +       return 0;
45945 +}
45946 +
45947 +static void free_blkif(void *arg)
45948 +{
45949 +       blkif_t *blkif = (blkif_t *)arg;
45950 +
45951 +       if (blkif->irq)
45952 +               unbind_from_irqhandler(blkif->irq, blkif);
45953 +
45954 +       if (blkif->blk_ring.sring) {
45955 +               unmap_frontend_page(blkif);
45956 +               free_vm_area(blkif->blk_ring_area);
45957 +               blkif->blk_ring.sring = NULL;
45958 +       }
45959 +
45960 +       kmem_cache_free(blkif_cachep, blkif);
45961 +}
45962 +
45963 +void free_blkif_callback(blkif_t *blkif)
45964 +{
45965 +       INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif);
45966 +       schedule_work(&blkif->free_work);
45967 +}
45968 +
45969 +void __init blkif_interface_init(void)
45970 +{
45971 +       blkif_cachep = kmem_cache_create(
45972 +               "blkif_cache", sizeof(blkif_t), 0, 0, NULL, NULL);
45973 +}
45974 +
45975 +/*
45976 + * Local variables:
45977 + *  c-file-style: "linux"
45978 + *  indent-tabs-mode: t
45979 + *  c-indent-level: 8
45980 + *  c-basic-offset: 8
45981 + *  tab-width: 8
45982 + * End:
45983 + */
45984 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blktap/Makefile tmp-linux-2.6-xen.patch/drivers/xen/blktap/Makefile
45985 --- ref-linux-2.6.16.9/drivers/xen/blktap/Makefile      1970-01-01 01:00:00.000000000 +0100
45986 +++ tmp-linux-2.6-xen.patch/drivers/xen/blktap/Makefile 2006-04-10 00:05:52.000000000 +0200
45987 @@ -0,0 +1,3 @@
45988 +
45989 +obj-y  := xenbus.o interface.o blktap.o 
45990 +
45991 diff -Nurp ref-linux-2.6.16.9/drivers/xen/blktap/xenbus.c tmp-linux-2.6-xen.patch/drivers/xen/blktap/xenbus.c
45992 --- ref-linux-2.6.16.9/drivers/xen/blktap/xenbus.c      1970-01-01 01:00:00.000000000 +0100
45993 +++ tmp-linux-2.6-xen.patch/drivers/xen/blktap/xenbus.c 2006-04-10 00:05:52.000000000 +0200
45994 @@ -0,0 +1,233 @@
45995 +/*  Xenbus code for blkif tap
45996 +
45997 +    A Warfield.
45998 +
45999 +    Hastily modified from the original backend code:
46000 +
46001 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
46002 +
46003 +    This program is free software; you can redistribute it and/or modify
46004 +    it under the terms of the GNU General Public License as published by
46005 +    the Free Software Foundation; either version 2 of the License, or
46006 +    (at your option) any later version.
46007 +
46008 +    This program is distributed in the hope that it will be useful,
46009 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
46010 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
46011 +    GNU General Public License for more details.
46012 +
46013 +    You should have received a copy of the GNU General Public License
46014 +    along with this program; if not, write to the Free Software
46015 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
46016 +*/
46017 +
46018 +#include <stdarg.h>
46019 +#include <linux/module.h>
46020 +#include <xen/xenbus.h>
46021 +#include "common.h"
46022 +
46023 +struct backend_info
46024 +{
46025 +       struct xenbus_device *dev;
46026 +
46027 +       /* our communications channel */
46028 +       blkif_t *blkif;
46029 +
46030 +       long int frontend_id;
46031 +
46032 +       /* watch back end for changes */
46033 +       struct xenbus_watch backend_watch;
46034 +
46035 +       /* watch front end for changes */
46036 +       struct xenbus_watch watch;
46037 +       char *frontpath;
46038 +};
46039 +
46040 +static int blkback_remove(struct xenbus_device *dev)
46041 +{
46042 +       struct backend_info *be = dev->data;
46043 +
46044 +       if (be->watch.node)
46045 +               unregister_xenbus_watch(&be->watch);
46046 +       unregister_xenbus_watch(&be->backend_watch);
46047 +       if (be->blkif)
46048 +               blkif_put(be->blkif);
46049 +       kfree(be->frontpath);
46050 +       kfree(be);
46051 +       return 0;
46052 +}
46053 +
46054 +/* The front end tells us the shared frame and event channel. */
46055 +static void frontend_changed(struct xenbus_watch *watch,
46056 +                            const char **vec, unsigned int len)
46057 +{
46058 +       unsigned long ring_ref;
46059 +       unsigned int evtchn;
46060 +       int err;
46061 +       struct backend_info *be
46062 +               = container_of(watch, struct backend_info, watch);
46063 +
46064 +       /* If other end is gone, delete ourself. */
46065 +       if (vec && !xenbus_exists(be->frontpath, "")) {
46066 +               xenbus_rm(be->dev->nodename, "");
46067 +               device_unregister(&be->dev->dev);
46068 +               return;
46069 +       }
46070 +       if (be->blkif == NULL || be->blkif->status == CONNECTED)
46071 +               return;
46072 +
46073 +       err = xenbus_gather(be->frontpath, "ring-ref", "%lu", &ring_ref,
46074 +                           "event-channel", "%u", &evtchn, NULL);
46075 +       if (err) {
46076 +               xenbus_dev_error(be->dev, err,
46077 +                                "reading %s/ring-ref and event-channel",
46078 +                                be->frontpath);
46079 +               return;
46080 +       }
46081 +
46082 +       /* Map the shared frame, irq etc. */
46083 +       err = blkif_map(be->blkif, ring_ref, evtchn);
46084 +       if (err) {
46085 +               xenbus_dev_error(be->dev, err, "mapping ring-ref %lu port %u",
46086 +                                ring_ref, evtchn);
46087 +               goto abort;
46088 +       }
46089 +
46090 +       xenbus_dev_ok(be->dev);
46091 +
46092 +       return;
46093 +
46094 +abort:
46095 +       xenbus_transaction_end(1);
46096 +}
46097 +
46098 +/*
46099 +   Setup supplies the physical device.
46100 +   We provide event-channel and device details to the front end.
46101 +   The front end supplies the shared frame and event channel.
46102 + */
46103 +static void backend_changed(struct xenbus_watch *watch,
46104 +                           const char **vec, unsigned int len)
46105 +{
46106 +       int err;
46107 +       char *p;
46108 +       long int handle;
46109 +       struct backend_info *be
46110 +               = container_of(watch, struct backend_info, backend_watch);
46111 +       struct xenbus_device *dev = be->dev;
46112 +
46113 +       if (be->blkif == NULL) {
46114 +               /* Front end dir is a number, which is used as the handle. */
46115 +               p = strrchr(be->frontpath, '/') + 1;
46116 +               handle = simple_strtoul(p, NULL, 0);
46117 +
46118 +               be->blkif = alloc_blkif(be->frontend_id);
46119 +               if (IS_ERR(be->blkif)) {
46120 +                       err = PTR_ERR(be->blkif);
46121 +                       be->blkif = NULL;
46122 +                       xenbus_dev_error(dev, err, "creating block interface");
46123 +                       return;
46124 +               }
46125 +
46126 +               /* Pass a NULL vec to skip the existence test. */
46127 +               frontend_changed(&be->watch, NULL, 0);
46128 +       }
46129 +}
46130 +
46131 +static int blkback_probe(struct xenbus_device *dev,
46132 +                        const struct xenbus_device_id *id)
46133 +{
46134 +       struct backend_info *be;
46135 +       char *frontend;
46136 +       int err;
46137 +
46138 +       be = kzalloc(sizeof(*be), GFP_KERNEL);
46139 +       if (!be) {
46140 +               xenbus_dev_error(dev, -ENOMEM, "allocating backend structure");
46141 +               return -ENOMEM;
46142 +       }
46143 +
46144 +       frontend = NULL;
46145 +       err = xenbus_gather(dev->nodename,
46146 +                           "frontend-id", "%li", &be->frontend_id,
46147 +                           "frontend", NULL, &frontend,
46148 +                           NULL);
46149 +       if (XENBUS_EXIST_ERR(err))
46150 +               goto free_be;
46151 +       if (err < 0) {
46152 +               xenbus_dev_error(dev, err,
46153 +                                "reading %s/frontend or frontend-id",
46154 +                                dev->nodename);
46155 +               goto free_be;
46156 +       }
46157 +       if (strlen(frontend) == 0 || !xenbus_exists(frontend, "")) {
46158 +               /* If we can't get a frontend path and a frontend-id,
46159 +                * then our bus-id is no longer valid and we need to
46160 +                * destroy the backend device.
46161 +                */
46162 +               err = -ENOENT;
46163 +               goto free_be;
46164 +       }
46165 +
46166 +       be->dev = dev;
46167 +       be->backend_watch.node = dev->nodename;
46168 +       be->backend_watch.callback = backend_changed;
46169 +       /* Registration implicitly fires backend_changed once */
46170 +       err = register_xenbus_watch(&be->backend_watch);
46171 +       if (err) {
46172 +               be->backend_watch.node = NULL;
46173 +               xenbus_dev_error(dev, err, "adding backend watch on %s",
46174 +                                dev->nodename);
46175 +               goto free_be;
46176 +       }
46177 +
46178 +       be->frontpath = frontend;
46179 +       be->watch.node = be->frontpath;
46180 +       be->watch.callback = frontend_changed;
46181 +       err = register_xenbus_watch(&be->watch);
46182 +       if (err) {
46183 +               be->watch.node = NULL;
46184 +               xenbus_dev_error(dev, err,
46185 +                                "adding frontend watch on %s",
46186 +                                be->frontpath);
46187 +               goto free_be;
46188 +       }
46189 +
46190 +       dev->data = be;
46191 +       return 0;
46192 +
46193 + free_be:
46194 +       if (be->backend_watch.node)
46195 +               unregister_xenbus_watch(&be->backend_watch);
46196 +       kfree(frontend);
46197 +       kfree(be);
46198 +       return err;
46199 +}
46200 +
46201 +static struct xenbus_device_id blkback_ids[] = {
46202 +       { "vbd" },
46203 +       { "" }
46204 +};
46205 +
46206 +static struct xenbus_driver blkback = {
46207 +       .name = "vbd",
46208 +       .owner = THIS_MODULE,
46209 +       .ids = blkback_ids,
46210 +       .probe = blkback_probe,
46211 +       .remove = blkback_remove,
46212 +};
46213 +
46214 +void blkif_xenbus_init(void)
46215 +{
46216 +       xenbus_register_backend(&blkback);
46217 +}
46218 +
46219 +/*
46220 + * Local variables:
46221 + *  c-file-style: "linux"
46222 + *  indent-tabs-mode: t
46223 + *  c-indent-level: 8
46224 + *  c-basic-offset: 8
46225 + *  tab-width: 8
46226 + * End:
46227 + */
46228 diff -Nurp ref-linux-2.6.16.9/drivers/xen/char/Makefile tmp-linux-2.6-xen.patch/drivers/xen/char/Makefile
46229 --- ref-linux-2.6.16.9/drivers/xen/char/Makefile        1970-01-01 01:00:00.000000000 +0100
46230 +++ tmp-linux-2.6-xen.patch/drivers/xen/char/Makefile   2006-04-10 00:05:52.000000000 +0200
46231 @@ -0,0 +1,2 @@
46232 +
46233 +obj-y  := mem.o
46234 diff -Nurp ref-linux-2.6.16.9/drivers/xen/char/mem.c tmp-linux-2.6-xen.patch/drivers/xen/char/mem.c
46235 --- ref-linux-2.6.16.9/drivers/xen/char/mem.c   1970-01-01 01:00:00.000000000 +0100
46236 +++ tmp-linux-2.6-xen.patch/drivers/xen/char/mem.c      2006-04-10 00:05:52.000000000 +0200
46237 @@ -0,0 +1,192 @@
46238 +/*
46239 + *  Originally from linux/drivers/char/mem.c
46240 + *
46241 + *  Copyright (C) 1991, 1992  Linus Torvalds
46242 + *
46243 + *  Added devfs support. 
46244 + *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
46245 + *  Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
46246 + */
46247 +
46248 +#include <linux/config.h>
46249 +#include <linux/mm.h>
46250 +#include <linux/miscdevice.h>
46251 +#include <linux/slab.h>
46252 +#include <linux/vmalloc.h>
46253 +#include <linux/mman.h>
46254 +#include <linux/random.h>
46255 +#include <linux/init.h>
46256 +#include <linux/raw.h>
46257 +#include <linux/tty.h>
46258 +#include <linux/capability.h>
46259 +#include <linux/smp_lock.h>
46260 +#include <linux/devfs_fs_kernel.h>
46261 +#include <linux/ptrace.h>
46262 +#include <linux/device.h>
46263 +#include <asm/pgalloc.h>
46264 +#include <asm/uaccess.h>
46265 +#include <asm/io.h>
46266 +#include <asm/hypervisor.h>
46267 +
46268 +static inline int uncached_access(struct file *file)
46269 +{
46270 +        if (file->f_flags & O_SYNC)
46271 +                return 1;
46272 +        /* Xen sets correct MTRR type on non-RAM for us. */
46273 +        return 0;
46274 +}
46275 +
46276 +/*
46277 + * This function reads the *physical* memory. The f_pos points directly to the
46278 + * memory location.
46279 + */
46280 +static ssize_t read_mem(struct file * file, char __user * buf,
46281 +                       size_t count, loff_t *ppos)
46282 +{
46283 +       unsigned long p = *ppos, ignored;
46284 +       ssize_t read = 0, sz;
46285 +       void __iomem *v;
46286 +
46287 +       while (count > 0) {
46288 +               /*
46289 +                * Handle first page in case it's not aligned
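+                * ((-p) & (PAGE_SIZE - 1)) is the byte count from p to the
+                * next page boundary; it is zero when p is page-aligned.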
46290 +                */
46291 +               if (-p & (PAGE_SIZE - 1))
46292 +                       sz = -p & (PAGE_SIZE - 1);
46293 +               else
46294 +                       sz = PAGE_SIZE;
46295 +
46296 +               sz = min_t(unsigned long, sz, count);
46297 +
46298 +               if ((v = ioremap(p, sz)) == NULL) {
46299 +                       /*
46300 +                        * Some programs (e.g., dmidecode) groove off into weird RAM
46301 +                        * areas where no tables can possibly exist (because Xen will
46302 +                        * have stomped on them!). These programs get rather upset if
46303 +                        * we let them know that Xen failed their access, so we fake
46304 +                        * out a read of all zeroes. :-)
46305 +                        */
46306 +                       if (clear_user(buf, count))
46307 +                               return -EFAULT;
46308 +                       read += count;
46309 +                       break;
46310 +               }
46311 +
46312 +               ignored = copy_to_user(buf, v, sz);
46313 +               iounmap(v);
46314 +               if (ignored)
46315 +                       return -EFAULT;
46316 +               buf += sz;
46317 +               p += sz;
46318 +               count -= sz;
46319 +               read += sz;
46320 +       }
46321 +
46322 +       *ppos += read;
46323 +       return read;
46324 +}
46325 +
46326 +static ssize_t write_mem(struct file * file, const char __user * buf, 
46327 +                        size_t count, loff_t *ppos)
46328 +{
46329 +       unsigned long p = *ppos, ignored;
46330 +       ssize_t written = 0, sz;
46331 +       void __iomem *v;
46332 +
46333 +       while (count > 0) {
46334 +               /*
46335 +                * Handle first page in case it's not aligned
46336 +                */
46337 +               if (-p & (PAGE_SIZE - 1))
46338 +                       sz = -p & (PAGE_SIZE - 1);
46339 +               else
46340 +                       sz = PAGE_SIZE;
46341 +
46342 +               sz = min_t(unsigned long, sz, count);
46343 +
46344 +               if ((v = ioremap(p, sz)) == NULL)
46345 +                       break;
46346 +
46347 +               ignored = copy_from_user(v, buf, sz);
46348 +               iounmap(v);
46349 +               if (ignored) {
46350 +                       written += sz - ignored;
46351 +                       if (written)
46352 +                               break;
46353 +                       return -EFAULT;
46354 +               }
46355 +               buf += sz;
46356 +               p += sz;
46357 +               count -= sz;
46358 +               written += sz;
46359 +       }
46360 +
46361 +       *ppos += written;
46362 +       return written;
46363 +}
46364 +
46365 +static int mmap_mem(struct file * file, struct vm_area_struct * vma)
46366 +{
46367 +       size_t size = vma->vm_end - vma->vm_start;
46368 +
46369 +       if (uncached_access(file))
46370 +               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
46371 +
46372 +       /* We want to return the real error code, not EAGAIN. */
46373 +       return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
46374 +                                     size, vma->vm_page_prot, DOMID_IO);
46375 +}
46376 +
46377 +/*
46378 + * The memory devices use the full 32/64 bits of the offset, and so we cannot
46379 + * check against negative addresses: they are ok. The return value is weird,
46380 + * though, in that case (0).
46381 + *
46382 + * also note that seeking relative to the "end of file" isn't supported:
46383 + * it has no meaning, so it returns -EINVAL.
46384 + */
46385 +static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
46386 +{
46387 +       loff_t ret;
46388 +
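+       /* Offsets are physical addresses, so large "negative" values are legal;
+        * force_successful_syscall_return() stops them being mistaken for -errno. */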
46389 +       mutex_lock(&file->f_dentry->d_inode->i_mutex);
46390 +       switch (orig) {
46391 +               case 0:
46392 +                       file->f_pos = offset;
46393 +                       ret = file->f_pos;
46394 +                       force_successful_syscall_return();
46395 +                       break;
46396 +               case 1:
46397 +                       file->f_pos += offset;
46398 +                       ret = file->f_pos;
46399 +                       force_successful_syscall_return();
46400 +                       break;
46401 +               default:
46402 +                       ret = -EINVAL;
46403 +       }
46404 +       mutex_unlock(&file->f_dentry->d_inode->i_mutex);
46405 +       return ret;
46406 +}
46407 +
46408 +static int open_mem(struct inode * inode, struct file * filp)
46409 +{
46410 +       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
46411 +}
46412 +
46413 +struct file_operations mem_fops = {
46414 +       .llseek         = memory_lseek,
46415 +       .read           = read_mem,
46416 +       .write          = write_mem,
46417 +       .mmap           = mmap_mem,
46418 +       .open           = open_mem,
46419 +};
46420 +
46421 +/*
46422 + * Local variables:
46423 + *  c-file-style: "linux"
46424 + *  indent-tabs-mode: t
46425 + *  c-indent-level: 8
46426 + *  c-basic-offset: 8
46427 + *  tab-width: 8
46428 + * End:
46429 + */
46430 diff -Nurp ref-linux-2.6.16.9/drivers/xen/console/console.c tmp-linux-2.6-xen.patch/drivers/xen/console/console.c
46431 --- ref-linux-2.6.16.9/drivers/xen/console/console.c    1970-01-01 01:00:00.000000000 +0100
46432 +++ tmp-linux-2.6-xen.patch/drivers/xen/console/console.c       2006-04-10 00:05:52.000000000 +0200
46433 @@ -0,0 +1,648 @@
46434 +/******************************************************************************
46435 + * console.c
46436 + * 
46437 + * Virtual console driver.
46438 + * 
46439 + * Copyright (c) 2002-2004, K A Fraser.
46440 + * 
46441 + * This program is free software; you can redistribute it and/or
46442 + * modify it under the terms of the GNU General Public License version 2
46443 + * as published by the Free Software Foundation; or, when distributed
46444 + * separately from the Linux kernel or incorporated into other
46445 + * software packages, subject to the following license:
46446 + * 
46447 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46448 + * of this source file (the "Software"), to deal in the Software without
46449 + * restriction, including without limitation the rights to use, copy, modify,
46450 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46451 + * and to permit persons to whom the Software is furnished to do so, subject to
46452 + * the following conditions:
46453 + * 
46454 + * The above copyright notice and this permission notice shall be included in
46455 + * all copies or substantial portions of the Software.
46456 + * 
46457 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46458 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46459 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46460 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46461 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46462 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46463 + * IN THE SOFTWARE.
46464 + */
46465 +
46466 +#include <linux/config.h>
46467 +#include <linux/version.h>
46468 +#include <linux/module.h>
46469 +#include <linux/errno.h>
46470 +#include <linux/signal.h>
46471 +#include <linux/sched.h>
46472 +#include <linux/interrupt.h>
46473 +#include <linux/tty.h>
46474 +#include <linux/tty_flip.h>
46475 +#include <linux/serial.h>
46476 +#include <linux/major.h>
46477 +#include <linux/ptrace.h>
46478 +#include <linux/ioport.h>
46479 +#include <linux/mm.h>
46480 +#include <linux/slab.h>
46481 +#include <linux/init.h>
46482 +#include <linux/console.h>
46483 +#include <linux/bootmem.h>
46484 +#include <linux/sysrq.h>
46485 +#include <asm/io.h>
46486 +#include <asm/irq.h>
46487 +#include <asm/uaccess.h>
46488 +#include <xen/interface/xen.h>
46489 +#include <xen/interface/event_channel.h>
46490 +#include <asm/hypervisor.h>
46491 +#include <xen/evtchn.h>
46492 +#include <xen/xencons.h>
46493 +
46494 +/*
46495 + * Modes:
46496 + *  'xencons=off'  [XC_OFF]:     Console is disabled.
46497 + *  'xencons=tty'  [XC_TTY]:     Console attached to '/dev/tty[0-9]+'.
46498 + *  'xencons=ttyS' [XC_SERIAL]:  Console attached to '/dev/ttyS[0-9]+'.
46499 + *                 [XC_DEFAULT]: DOM0 -> XC_SERIAL ; all others -> XC_TTY.
46500 + * 
46501 + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
46502 + * warnings from standard distro startup scripts.
46503 + */
46504 +static enum { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL } xc_mode = XC_DEFAULT;
46505 +static int xc_num = -1;
46506 +
46507 +#ifdef CONFIG_MAGIC_SYSRQ
46508 +static unsigned long sysrq_requested;
46509 +extern int sysrq_enabled;
46510 +#endif
46511 +
46512 +static int __init xencons_setup(char *str)
46513 +{
46514 +       char *q;
46515 +       int n;
46516 +
46517 +       if (!strncmp(str, "ttyS", 4))
46518 +               xc_mode = XC_SERIAL;
46519 +       else if (!strncmp(str, "tty", 3))
46520 +               xc_mode = XC_TTY;
46521 +       else if (!strncmp(str, "off", 3))
46522 +               xc_mode = XC_OFF;
46523 +
46524 +       switch (xc_mode) {
46525 +       case XC_SERIAL:
46526 +               n = simple_strtol(str+4, &q, 10);
46527 +               if (q > (str + 4))
46528 +                       xc_num = n;
46529 +               break;
46530 +       case XC_TTY:
46531 +               n = simple_strtol(str+3, &q, 10);
46532 +               if (q > (str + 3))
46533 +                       xc_num = n;
46534 +               break;
46535 +       default:
46536 +               break;
46537 +       }
46538 +
46539 +       return 1;
46540 +}
46541 +__setup("xencons=", xencons_setup);
46542 +
46543 +/* The kernel and user-land drivers share a common transmit buffer. */
46544 +static unsigned int wbuf_size = 4096;
46545 +#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
46546 +static char *wbuf;
46547 +static unsigned int wc, wp; /* write_cons, write_prod */
46548 +
46549 +static int __init xencons_bufsz_setup(char *str)
46550 +{
46551 +       unsigned int goal;
46552 +       goal = simple_strtoul(str, NULL, 0);
46553 +       while (wbuf_size < goal)
46554 +               wbuf_size <<= 1;
46555 +       return 1;
46556 +}
46557 +__setup("xencons_bufsz=", xencons_bufsz_setup);
46558 +
46559 +/* This lock protects accesses to the common transmit buffer. */
46560 +static spinlock_t xencons_lock = SPIN_LOCK_UNLOCKED;
46561 +
46562 +/* Common transmit-kick routine. */
46563 +static void __xencons_tx_flush(void);
46564 +
46565 +static struct tty_driver *xencons_driver;
46566 +
46567 +/******************** Kernel console driver ********************************/
46568 +
46569 +static void kcons_write(
46570 +       struct console *c, const char *s, unsigned int count)
46571 +{
46572 +       int           i = 0;
46573 +       unsigned long flags;
46574 +
46575 +       spin_lock_irqsave(&xencons_lock, flags);
46576 +
46577 +       while (i < count) {
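+               /* Fill while at least two slots are free (an LF may gain a CR), then flush. */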
46578 +               for (; i < count; i++) {
46579 +                       if ((wp - wc) >= (wbuf_size - 1))
46580 +                               break;
46581 +                       if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
46582 +                               wbuf[WBUF_MASK(wp++)] = '\r';
46583 +               }
46584 +
46585 +               __xencons_tx_flush();
46586 +       }
46587 +
46588 +       spin_unlock_irqrestore(&xencons_lock, flags);
46589 +}
46590 +
46591 +static void kcons_write_dom0(
46592 +       struct console *c, const char *s, unsigned int count)
46593 +{
46594 +       int rc;
46595 +
46596 +       while ((count > 0) &&
46597 +              ((rc = HYPERVISOR_console_io(
46598 +                       CONSOLEIO_write, count, (char *)s)) > 0)) {
46599 +               count -= rc;
46600 +               s += rc;
46601 +       }
46602 +}
46603 +
46604 +static struct tty_driver *kcons_device(struct console *c, int *index)
46605 +{
46606 +       *index = 0;
46607 +       return xencons_driver;
46608 +}
46609 +
46610 +static struct console kcons_info = {
46611 +       .device = kcons_device,
46612 +       .flags  = CON_PRINTBUFFER,
46613 +       .index  = -1,
46614 +};
46615 +
46616 +#define __RETCODE 0
46617 +static int __init xen_console_init(void)
46618 +{
46619 +       if (xen_init() < 0)
46620 +               return __RETCODE;
46621 +
46622 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
46623 +               if (xc_mode == XC_DEFAULT)
46624 +                       xc_mode = XC_SERIAL;
46625 +               kcons_info.write = kcons_write_dom0;
46626 +               if (xc_mode == XC_SERIAL)
46627 +                       kcons_info.flags |= CON_ENABLED;
46628 +       } else {
46629 +               if (xc_mode == XC_DEFAULT)
46630 +                       xc_mode = XC_TTY;
46631 +               kcons_info.write = kcons_write;
46632 +       }
46633 +
46634 +       switch (xc_mode) {
46635 +       case XC_SERIAL:
46636 +               strcpy(kcons_info.name, "ttyS");
46637 +               if (xc_num == -1)
46638 +                       xc_num = 0;
46639 +               break;
46640 +
46641 +       case XC_TTY:
46642 +               strcpy(kcons_info.name, "tty");
46643 +               if (xc_num == -1)
46644 +                       xc_num = 1;
46645 +               break;
46646 +
46647 +       default:
46648 +               return __RETCODE;
46649 +       }
46650 +
46651 +       wbuf = alloc_bootmem(wbuf_size);
46652 +
46653 +       register_console(&kcons_info);
46654 +
46655 +       return __RETCODE;
46656 +}
46657 +console_initcall(xen_console_init);
46658 +
46659 +/*** Useful function for console debugging -- goes straight to Xen. ***/
46660 +asmlinkage int xprintk(const char *fmt, ...)
46661 +{
46662 +       va_list args;
46663 +       int printk_len;
46664 +       static char printk_buf[1024];
46665 +
46666 +       /* Emit the output into the temporary buffer */
46667 +       va_start(args, fmt);
46668 +       printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
46669 +       va_end(args);
46670 +
46671 +       /* Send the processed output directly to Xen. */
46672 +       kcons_write_dom0(NULL, printk_buf, printk_len);
46673 +
46674 +       return 0;
46675 +}
46676 +
46677 +/*** Forcibly flush console data before dying. ***/
46678 +void xencons_force_flush(void)
46679 +{
46680 +       int sz;
46681 +
46682 +       /* Emergency console is synchronous, so there's nothing to flush. */
46683 +       if (xen_start_info->flags & SIF_INITDOMAIN)
46684 +               return;
46685 +
46686 +       /* Spin until console data is flushed through to the daemon. */
46687 +       while (wc != wp) {
46688 +               int sent = 0;
46689 +               if ((sz = wp - wc) == 0)
46690 +                       continue;
46691 +               sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
46692 +               if (sent > 0)
46693 +                       wc += sent;
46694 +       }
46695 +}
46696 +
46697 +
46698 +/******************** User-space console driver (/dev/console) ************/
46699 +
46700 +#define DRV(_d)         (_d)
46701 +#define TTY_INDEX(_tty) ((_tty)->index)
46702 +
46703 +static struct termios *xencons_termios[MAX_NR_CONSOLES];
46704 +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
46705 +static struct tty_struct *xencons_tty;
46706 +static int xencons_priv_irq;
46707 +static char x_char;
46708 +
46709 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
46710 +{
46711 +       int           i;
46712 +       unsigned long flags;
46713 +
46714 +       spin_lock_irqsave(&xencons_lock, flags);
46715 +       if (xencons_tty == NULL)
46716 +               goto out;
46717 +
46718 +       for (i = 0; i < len; i++) {
46719 +#ifdef CONFIG_MAGIC_SYSRQ
46720 +               if (sysrq_enabled) {
46721 +                       if (buf[i] == '\x0f') { /* ^O */
46722 +                               sysrq_requested = jiffies;
46723 +                               continue; /* don't print the sysrq key */
46724 +                       } else if (sysrq_requested) {
46725 +                               unsigned long sysrq_timeout =
46726 +                                       sysrq_requested + HZ*2;
46727 +                               sysrq_requested = 0;
46728 +                               if (time_before(jiffies, sysrq_timeout)) {
46729 +                                       spin_unlock_irqrestore(
46730 +                                               &xencons_lock, flags);
46731 +                                       handle_sysrq(
46732 +                                               buf[i], regs, xencons_tty);
46733 +                                       spin_lock_irqsave(
46734 +                                               &xencons_lock, flags);
46735 +                                       continue;
46736 +                               }
46737 +                       }
46738 +               }
46739 +#endif
46740 +               tty_insert_flip_char(xencons_tty, buf[i], 0);
46741 +       }
46742 +       tty_flip_buffer_push(xencons_tty);
46743 +
46744 + out:
46745 +       spin_unlock_irqrestore(&xencons_lock, flags);
46746 +}
46747 +
46748 +static void __xencons_tx_flush(void)
46749 +{
46750 +       int sent, sz, work_done = 0;
46751 +
46752 +       if (x_char) {
46753 +               if (xen_start_info->flags & SIF_INITDOMAIN)
46754 +                       kcons_write_dom0(NULL, &x_char, 1);
46755 +               else
46756 +                       while (x_char)
46757 +                               if (xencons_ring_send(&x_char, 1) == 1)
46758 +                                       break;
46759 +               x_char = 0;
46760 +               work_done = 1;
46761 +       }
46762 +
46763 +       while (wc != wp) {
46764 +               sz = wp - wc;
46765 +               if (sz > (wbuf_size - WBUF_MASK(wc)))
46766 +                       sz = wbuf_size - WBUF_MASK(wc);
46767 +               if (xen_start_info->flags & SIF_INITDOMAIN) {
46768 +                       kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
46769 +                       wc += sz;
46770 +               } else {
46771 +                       sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
46772 +                       if (sent == 0)
46773 +                               break;
46774 +                       wc += sent;
46775 +               }
46776 +               work_done = 1;
46777 +       }
46778 +
46779 +       if (work_done && (xencons_tty != NULL)) {
46780 +               wake_up_interruptible(&xencons_tty->write_wait);
46781 +               if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
46782 +                   (xencons_tty->ldisc.write_wakeup != NULL))
46783 +                       (xencons_tty->ldisc.write_wakeup)(xencons_tty);
46784 +       }
46785 +}
46786 +
46787 +void xencons_tx(void)
46788 +{
46789 +       unsigned long flags;
46790 +
46791 +       spin_lock_irqsave(&xencons_lock, flags);
46792 +       __xencons_tx_flush();
46793 +       spin_unlock_irqrestore(&xencons_lock, flags);
46794 +}
46795 +
46796 +/* Privileged receive callback and transmit kicker. */
46797 +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
46798 +                                          struct pt_regs *regs)
46799 +{
46800 +       static char rbuf[16];
46801 +       int         l;
46802 +
46803 +       while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
46804 +               xencons_rx(rbuf, l, regs);
46805 +
46806 +       xencons_tx();
46807 +
46808 +       return IRQ_HANDLED;
46809 +}
46810 +
46811 +static int xencons_write_room(struct tty_struct *tty)
46812 +{
46813 +       return wbuf_size - (wp - wc);
46814 +}
46815 +
46816 +static int xencons_chars_in_buffer(struct tty_struct *tty)
46817 +{
46818 +       return wp - wc;
46819 +}
46820 +
46821 +static void xencons_send_xchar(struct tty_struct *tty, char ch)
46822 +{
46823 +       unsigned long flags;
46824 +
46825 +       if (TTY_INDEX(tty) != 0)
46826 +               return;
46827 +
46828 +       spin_lock_irqsave(&xencons_lock, flags);
46829 +       x_char = ch;
46830 +       __xencons_tx_flush();
46831 +       spin_unlock_irqrestore(&xencons_lock, flags);
46832 +}
46833 +
46834 +static void xencons_throttle(struct tty_struct *tty)
46835 +{
46836 +       if (TTY_INDEX(tty) != 0)
46837 +               return;
46838 +
46839 +       if (I_IXOFF(tty))
46840 +               xencons_send_xchar(tty, STOP_CHAR(tty));
46841 +}
46842 +
46843 +static void xencons_unthrottle(struct tty_struct *tty)
46844 +{
46845 +       if (TTY_INDEX(tty) != 0)
46846 +               return;
46847 +
46848 +       if (I_IXOFF(tty)) {
46849 +               if (x_char != 0)
46850 +                       x_char = 0;
46851 +               else
46852 +                       xencons_send_xchar(tty, START_CHAR(tty));
46853 +       }
46854 +}
46855 +
46856 +static void xencons_flush_buffer(struct tty_struct *tty)
46857 +{
46858 +       unsigned long flags;
46859 +
46860 +       if (TTY_INDEX(tty) != 0)
46861 +               return;
46862 +
46863 +       spin_lock_irqsave(&xencons_lock, flags);
46864 +       wc = wp = 0;
46865 +       spin_unlock_irqrestore(&xencons_lock, flags);
46866 +}
46867 +
46868 +static inline int __xencons_put_char(int ch)
46869 +{
46870 +       char _ch = (char)ch;
46871 +       if ((wp - wc) == wbuf_size)
46872 +               return 0;
46873 +       wbuf[WBUF_MASK(wp++)] = _ch;
46874 +       return 1;
46875 +}
46876 +
46877 +static int xencons_write(
46878 +       struct tty_struct *tty,
46879 +       const unsigned char *buf,
46880 +       int count)
46881 +{
46882 +       int i;
46883 +       unsigned long flags;
46884 +
46885 +       if (TTY_INDEX(tty) != 0)
46886 +               return count;
46887 +
46888 +       spin_lock_irqsave(&xencons_lock, flags);
46889 +
46890 +       for (i = 0; i < count; i++)
46891 +               if (!__xencons_put_char(buf[i]))
46892 +                       break;
46893 +
46894 +       if (i != 0)
46895 +               __xencons_tx_flush();
46896 +
46897 +       spin_unlock_irqrestore(&xencons_lock, flags);
46898 +
46899 +       return i;
46900 +}
46901 +
46902 +static void xencons_put_char(struct tty_struct *tty, u_char ch)
46903 +{
46904 +       unsigned long flags;
46905 +
46906 +       if (TTY_INDEX(tty) != 0)
46907 +               return;
46908 +
46909 +       spin_lock_irqsave(&xencons_lock, flags);
46910 +       (void)__xencons_put_char(ch);
46911 +       spin_unlock_irqrestore(&xencons_lock, flags);
46912 +}
46913 +
46914 +static void xencons_flush_chars(struct tty_struct *tty)
46915 +{
46916 +       unsigned long flags;
46917 +
46918 +       if (TTY_INDEX(tty) != 0)
46919 +               return;
46920 +
46921 +       spin_lock_irqsave(&xencons_lock, flags);
46922 +       __xencons_tx_flush();
46923 +       spin_unlock_irqrestore(&xencons_lock, flags);
46924 +}
46925 +
46926 +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
46927 +{
46928 +       unsigned long orig_jiffies = jiffies;
46929 +
46930 +       if (TTY_INDEX(tty) != 0)
46931 +               return;
46932 +
46933 +       while (DRV(tty->driver)->chars_in_buffer(tty)) {
46934 +               set_current_state(TASK_INTERRUPTIBLE);
46935 +               schedule_timeout(1);
46936 +               if (signal_pending(current))
46937 +                       break;
46938 +               if (timeout && time_after(jiffies, orig_jiffies + timeout))
46939 +                       break;
46940 +       }
46941 +
46942 +       set_current_state(TASK_RUNNING);
46943 +}
46944 +
46945 +static int xencons_open(struct tty_struct *tty, struct file *filp)
46946 +{
46947 +       unsigned long flags;
46948 +
46949 +       if (TTY_INDEX(tty) != 0)
46950 +               return 0;
46951 +
46952 +       spin_lock_irqsave(&xencons_lock, flags);
46953 +       tty->driver_data = NULL;
46954 +       if (xencons_tty == NULL)
46955 +               xencons_tty = tty;
46956 +       __xencons_tx_flush();
46957 +       spin_unlock_irqrestore(&xencons_lock, flags);
46958 +
46959 +       return 0;
46960 +}
46961 +
46962 +static void xencons_close(struct tty_struct *tty, struct file *filp)
46963 +{
46964 +       unsigned long flags;
46965 +
46966 +       if (TTY_INDEX(tty) != 0)
46967 +               return;
46968 +
46969 +       if (tty->count == 1) {
46970 +               tty->closing = 1;
46971 +               tty_wait_until_sent(tty, 0);
46972 +               if (DRV(tty->driver)->flush_buffer != NULL)
46973 +                       DRV(tty->driver)->flush_buffer(tty);
46974 +               if (tty->ldisc.flush_buffer != NULL)
46975 +                       tty->ldisc.flush_buffer(tty);
46976 +               tty->closing = 0;
46977 +               spin_lock_irqsave(&xencons_lock, flags);
46978 +               xencons_tty = NULL;
46979 +               spin_unlock_irqrestore(&xencons_lock, flags);
46980 +       }
46981 +}
46982 +
46983 +static struct tty_operations xencons_ops = {
46984 +       .open = xencons_open,
46985 +       .close = xencons_close,
46986 +       .write = xencons_write,
46987 +       .write_room = xencons_write_room,
46988 +       .put_char = xencons_put_char,
46989 +       .flush_chars = xencons_flush_chars,
46990 +       .chars_in_buffer = xencons_chars_in_buffer,
46991 +       .send_xchar = xencons_send_xchar,
46992 +       .flush_buffer = xencons_flush_buffer,
46993 +       .throttle = xencons_throttle,
46994 +       .unthrottle = xencons_unthrottle,
46995 +       .wait_until_sent = xencons_wait_until_sent,
46996 +};
46997 +
46998 +static int __init xencons_init(void)
46999 +{
47000 +       int rc;
47001 +
47002 +       if (xen_init() < 0)
47003 +               return -ENODEV;
47004 +
47005 +       if (xc_mode == XC_OFF)
47006 +               return 0;
47007 +
47008 +       xencons_ring_init();
47009 +
47010 +       xencons_driver = alloc_tty_driver((xc_mode == XC_SERIAL) ?
47011 +                                         1 : MAX_NR_CONSOLES);
47012 +       if (xencons_driver == NULL)
47013 +               return -ENOMEM;
47014 +
47015 +       DRV(xencons_driver)->name            = "xencons";
47016 +       DRV(xencons_driver)->major           = TTY_MAJOR;
47017 +       DRV(xencons_driver)->type            = TTY_DRIVER_TYPE_SERIAL;
47018 +       DRV(xencons_driver)->subtype         = SERIAL_TYPE_NORMAL;
47019 +       DRV(xencons_driver)->init_termios    = tty_std_termios;
47020 +       DRV(xencons_driver)->flags           =
47021 +               TTY_DRIVER_REAL_RAW |
47022 +               TTY_DRIVER_RESET_TERMIOS |
47023 +               TTY_DRIVER_NO_DEVFS;
47024 +       DRV(xencons_driver)->termios         = xencons_termios;
47025 +       DRV(xencons_driver)->termios_locked  = xencons_termios_locked;
47026 +
47027 +       if (xc_mode == XC_SERIAL) {
47028 +               DRV(xencons_driver)->name        = "ttyS";
47029 +               DRV(xencons_driver)->minor_start = 64 + xc_num;
47030 +               DRV(xencons_driver)->name_base   = 0 + xc_num;
47031 +       } else {
47032 +               DRV(xencons_driver)->name        = "tty";
47033 +               DRV(xencons_driver)->minor_start = xc_num;
47034 +               DRV(xencons_driver)->name_base   = xc_num;
47035 +       }
47036 +
47037 +       tty_set_operations(xencons_driver, &xencons_ops);
47038 +
47039 +       if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
47040 +               printk("WARNING: Failed to register Xen virtual "
47041 +                      "console driver as '%s%d'\n",
47042 +                      DRV(xencons_driver)->name,
47043 +                      DRV(xencons_driver)->name_base);
47044 +               put_tty_driver(xencons_driver);
47045 +               xencons_driver = NULL;
47046 +               return rc;
47047 +       }
47048 +
47049 +       tty_register_device(xencons_driver, 0, NULL);
47050 +
47051 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
47052 +               xencons_priv_irq = bind_virq_to_irqhandler(
47053 +                       VIRQ_CONSOLE,
47054 +                       0,
47055 +                       xencons_priv_interrupt,
47056 +                       0,
47057 +                       "console",
47058 +                       NULL);
47059 +               BUG_ON(xencons_priv_irq < 0);
47060 +       }
47061 +
47062 +       printk("Xen virtual console successfully installed as %s%d\n",
47063 +              DRV(xencons_driver)->name,
47064 +              DRV(xencons_driver)->name_base);
47065 +
47066 +       return 0;
47067 +}
47068 +
47069 +module_init(xencons_init);
47070 +
47071 +MODULE_LICENSE("Dual BSD/GPL");
47072 +
47073 +/*
47074 + * Local variables:
47075 + *  c-file-style: "linux"
47076 + *  indent-tabs-mode: t
47077 + *  c-indent-level: 8
47078 + *  c-basic-offset: 8
47079 + *  tab-width: 8
47080 + * End:
47081 + */
47082 diff -Nurp ref-linux-2.6.16.9/drivers/xen/console/Makefile tmp-linux-2.6-xen.patch/drivers/xen/console/Makefile
47083 --- ref-linux-2.6.16.9/drivers/xen/console/Makefile     1970-01-01 01:00:00.000000000 +0100
47084 +++ tmp-linux-2.6-xen.patch/drivers/xen/console/Makefile        2006-04-10 00:05:52.000000000 +0200
47085 @@ -0,0 +1,2 @@
47086 +
47087 +obj-y  := console.o xencons_ring.o
47088 diff -Nurp ref-linux-2.6.16.9/drivers/xen/console/xencons_ring.c tmp-linux-2.6-xen.patch/drivers/xen/console/xencons_ring.c
47089 --- ref-linux-2.6.16.9/drivers/xen/console/xencons_ring.c       1970-01-01 01:00:00.000000000 +0100
47090 +++ tmp-linux-2.6-xen.patch/drivers/xen/console/xencons_ring.c  2006-04-10 00:05:52.000000000 +0200
47091 @@ -0,0 +1,151 @@
47092 +/* 
47093 + * This program is free software; you can redistribute it and/or
47094 + * modify it under the terms of the GNU General Public License version 2
47095 + * as published by the Free Software Foundation; or, when distributed
47096 + * separately from the Linux kernel or incorporated into other
47097 + * software packages, subject to the following license:
47098 + * 
47099 + * Permission is hereby granted, free of charge, to any person obtaining a copy
47100 + * of this source file (the "Software"), to deal in the Software without
47101 + * restriction, including without limitation the rights to use, copy, modify,
47102 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
47103 + * and to permit persons to whom the Software is furnished to do so, subject to
47104 + * the following conditions:
47105 + * 
47106 + * The above copyright notice and this permission notice shall be included in
47107 + * all copies or substantial portions of the Software.
47108 + * 
47109 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47110 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47111 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47112 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47113 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
47114 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
47115 + * IN THE SOFTWARE.
47116 + */
47117 +
47118 +#include <linux/version.h>
47119 +#include <linux/module.h>
47120 +#include <linux/errno.h>
47121 +#include <linux/signal.h>
47122 +#include <linux/sched.h>
47123 +#include <linux/interrupt.h>
47124 +#include <linux/tty.h>
47125 +#include <linux/tty_flip.h>
47126 +#include <linux/serial.h>
47127 +#include <linux/major.h>
47128 +#include <linux/ptrace.h>
47129 +#include <linux/ioport.h>
47130 +#include <linux/mm.h>
47131 +#include <linux/slab.h>
47132 +
47133 +#include <asm/hypervisor.h>
47134 +#include <xen/evtchn.h>
47135 +#include <xen/xencons.h>
47136 +#include <linux/wait.h>
47137 +#include <linux/interrupt.h>
47138 +#include <linux/sched.h>
47139 +#include <linux/err.h>
47140 +#include <xen/interface/io/console.h>
47141 +
47142 +static int xencons_irq;
47143 +
47144 +static inline struct xencons_interface *xencons_interface(void)
47145 +{
47146 +       return mfn_to_virt(xen_start_info->console_mfn);
47147 +}
47148 +
47149 +static inline void notify_daemon(void)
47150 +{
47151 +       /* Use evtchn: this is called early, before irq is set up. */
47152 +       notify_remote_via_evtchn(xen_start_info->console_evtchn);
47153 +}
47154 +
47155 +int xencons_ring_send(const char *data, unsigned len)
47156 +{
47157 +       int sent = 0;
47158 +       struct xencons_interface *intf = xencons_interface();
47159 +       XENCONS_RING_IDX cons, prod;
47160 +
47161 +       cons = intf->out_cons;
47162 +       prod = intf->out_prod;
47163 +       mb();
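+       /* Ring indices are free-running; (prod - cons) is the data currently queued. */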
47164 +       BUG_ON((prod - cons) > sizeof(intf->out));
47165 +
47166 +       while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
47167 +               intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
47168 +
47169 +       wmb();
47170 +       intf->out_prod = prod;
47171 +
47172 +       notify_daemon();
47173 +
47174 +       return sent;
47175 +}
47176 +
47177 +static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
47178 +{
47179 +       struct xencons_interface *intf = xencons_interface();
47180 +       XENCONS_RING_IDX cons, prod;
47181 +
47182 +       cons = intf->in_cons;
47183 +       prod = intf->in_prod;
47184 +       mb();
47185 +       BUG_ON((prod - cons) > sizeof(intf->in));
47186 +
47187 +       while (cons != prod) {
47188 +               xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
47189 +               cons++;
47190 +       }
47191 +
47192 +       mb();
47193 +       intf->in_cons = cons;
47194 +
47195 +       notify_daemon();
47196 +
47197 +       xencons_tx();
47198 +
47199 +       return IRQ_HANDLED;
47200 +}
47201 +
47202 +int xencons_ring_init(void)
47203 +{
47204 +       int err;
47205 +
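+       /* Drop any stale binding first: this path is re-run on resume. */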
47206 +       if (xencons_irq)
47207 +               unbind_from_irqhandler(xencons_irq, NULL);
47208 +       xencons_irq = 0;
47209 +
47210 +       if (!xen_start_info->console_evtchn)
47211 +               return 0;
47212 +
47213 +       err = bind_evtchn_to_irqhandler(
47214 +               xen_start_info->console_evtchn,
47215 +               handle_input, 0, "xencons", NULL);
47216 +       if (err <= 0) {
47217 +               printk(KERN_ERR "XEN console request irq failed %i\n", err);
47218 +               return err;
47219 +       }
47220 +
47221 +       xencons_irq = err;
47222 +
47223 +       /* In case we have in-flight data after save/restore... */
47224 +       notify_daemon();
47225 +
47226 +       return 0;
47227 +}
47228 +
47229 +void xencons_resume(void)
47230 +{
47231 +       (void)xencons_ring_init();
47232 +}
47233 +
47234 +/*
47235 + * Local variables:
47236 + *  c-file-style: "linux"
47237 + *  indent-tabs-mode: t
47238 + *  c-indent-level: 8
47239 + *  c-basic-offset: 8
47240 + *  tab-width: 8
47241 + * End:
47242 + */
47243 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/evtchn.c tmp-linux-2.6-xen.patch/drivers/xen/core/evtchn.c
47244 --- ref-linux-2.6.16.9/drivers/xen/core/evtchn.c        1970-01-01 01:00:00.000000000 +0100
47245 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/evtchn.c   2006-04-10 00:05:52.000000000 +0200
47246 @@ -0,0 +1,863 @@
47247 +/******************************************************************************
47248 + * evtchn.c
47249 + * 
47250 + * Communication via Xen event channels.
47251 + * 
47252 + * Copyright (c) 2002-2005, K A Fraser
47253 + * 
47254 + * This program is free software; you can redistribute it and/or
47255 + * modify it under the terms of the GNU General Public License version 2
47256 + * as published by the Free Software Foundation; or, when distributed
47257 + * separately from the Linux kernel or incorporated into other
47258 + * software packages, subject to the following license:
47259 + * 
47260 + * Permission is hereby granted, free of charge, to any person obtaining a copy
47261 + * of this source file (the "Software"), to deal in the Software without
47262 + * restriction, including without limitation the rights to use, copy, modify,
47263 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
47264 + * and to permit persons to whom the Software is furnished to do so, subject to
47265 + * the following conditions:
47266 + * 
47267 + * The above copyright notice and this permission notice shall be included in
47268 + * all copies or substantial portions of the Software.
47269 + * 
47270 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47271 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47272 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47273 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47274 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
47275 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
47276 + * IN THE SOFTWARE.
47277 + */
47278 +
47279 +#include <linux/config.h>
47280 +#include <linux/module.h>
47281 +#include <linux/irq.h>
47282 +#include <linux/interrupt.h>
47283 +#include <linux/sched.h>
47284 +#include <linux/kernel_stat.h>
47285 +#include <linux/version.h>
47286 +#include <asm/atomic.h>
47287 +#include <asm/system.h>
47288 +#include <asm/ptrace.h>
47289 +#include <asm/synch_bitops.h>
47290 +#include <xen/interface/event_channel.h>
47291 +#include <xen/interface/physdev.h>
47292 +#include <asm/hypervisor.h>
47293 +#include <xen/evtchn.h>
47294 +#include <linux/mc146818rtc.h> /* RTC_IRQ */
47295 +
47296 +/*
47297 + * This lock protects updates to the following mapping and reference-count
47298 + * arrays. The lock does not need to be acquired to read the mapping tables.
47299 + */
47300 +static spinlock_t irq_mapping_update_lock;
47301 +
47302 +/* IRQ <-> event-channel mappings. */
47303 +static int evtchn_to_irq[NR_EVENT_CHANNELS];
47304 +
47305 +/* Packed IRQ information: binding type, sub-type index, and event channel. */
47306 +static u32 irq_info[NR_IRQS];
47307 +
47308 +/* Binding types. */
47309 +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
47310 +
47311 +/* Constructor for packed IRQ information. */
47312 +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
47313 +{
47314 +       return ((type << 24) | (index << 16) | evtchn);
47315 +}
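+/* Layout: bits 31-24 binding type, 23-16 sub-type index, 15-0 event channel. */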
47316 +
47317 +/* Convenient shorthand for packed representation of an unbound IRQ. */
47318 +#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
47319 +
47320 +/*
47321 + * Accessors for packed IRQ information.
47322 + */
47323 +
47324 +static inline unsigned int evtchn_from_irq(int irq)
47325 +{
47326 +       return (u16)(irq_info[irq]);
47327 +}
47328 +
47329 +static inline unsigned int index_from_irq(int irq)
47330 +{
47331 +       return (u8)(irq_info[irq] >> 16);
47332 +}
47333 +
47334 +static inline unsigned int type_from_irq(int irq)
47335 +{
47336 +       return (u8)(irq_info[irq] >> 24);
47337 +}
47338 +
47339 +/* IRQ <-> VIRQ mapping. */
47340 +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]);
47341 +
47342 +/* IRQ <-> IPI mapping. */
47343 +#ifndef NR_IPIS
47344 +#define NR_IPIS 1
47345 +#endif
47346 +DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
47347 +
47348 +/* Reference counts for bindings to IRQs. */
47349 +static int irq_bindcount[NR_IRQS];
47350 +
47351 +/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
47352 +static unsigned long pirq_needs_unmask_notify[NR_PIRQS/sizeof(unsigned long)];
47353 +
47354 +#ifdef CONFIG_SMP
47355 +
47356 +static u8 cpu_evtchn[NR_EVENT_CHANNELS];
47357 +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
47358 +
47359 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
47360 +                                          unsigned int idx)
47361 +{
47362 +       return (sh->evtchn_pending[idx] &
47363 +               cpu_evtchn_mask[cpu][idx] &
47364 +               ~sh->evtchn_mask[idx]);
47365 +}
47366 +
47367 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
47368 +{
47369 +       clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
47370 +       set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
47371 +       cpu_evtchn[chn] = cpu;
47372 +}
47373 +
47374 +static void init_evtchn_cpu_bindings(void)
47375 +{
47376 +       /* By default all event channels notify CPU#0. */
47377 +       memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
47378 +       memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
47379 +}
47380 +
47381 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
47382 +{
47383 +       return cpu_evtchn[evtchn];
47384 +}
47385 +
47386 +#else
47387 +
47388 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
47389 +                                          unsigned int idx)
47390 +{
47391 +       return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
47392 +}
47393 +
47394 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
47395 +{
47396 +}
47397 +
47398 +static void init_evtchn_cpu_bindings(void)
47399 +{
47400 +}
47401 +
47402 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
47403 +{
47404 +       return 0;
47405 +}
47406 +
47407 +#endif
47408 +
47409 +/* Upcall to generic IRQ layer. */
47410 +#ifdef CONFIG_X86
47411 +extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
47412 +#if defined (__i386__)
47413 +static inline void exit_idle(void) {}
47414 +#define IRQ_REG orig_eax
47415 +#elif defined (__x86_64__)
47416 +#include <asm/idle.h>
47417 +#define IRQ_REG orig_rax
47418 +#endif
47419 +#define do_IRQ(irq, regs) do {         \
47420 +       (regs)->IRQ_REG = ~(irq);       \
47421 +       do_IRQ((regs));                 \
47422 +} while (0)
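+/* do_IRQ() recovers the IRQ number by complementing orig_eax/orig_rax, hence ~(irq). */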
47423 +#endif
47424 +
47425 +/* Xen will never allocate port zero for any purpose. */
47426 +#define VALID_EVTCHN(chn)      ((chn) != 0)
47427 +
47428 +/*
47429 + * Force a proper event-channel callback from Xen after clearing the
47430 + * callback mask. We do this in a very simple manner, by making a call
47431 + * down into Xen. The pending flag will be checked by Xen on return.
47432 + */
47433 +void force_evtchn_callback(void)
47434 +{
47435 +       (void)HYPERVISOR_xen_version(0, NULL);
47436 +}
47437 +EXPORT_SYMBOL_GPL(force_evtchn_callback);
47438 +
47439 +/* NB. Interrupts are disabled on entry. */
47440 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
47441 +{
47442 +       unsigned long  l1, l2;
47443 +       unsigned int   l1i, l2i, port;
47444 +       int            irq, cpu = smp_processor_id();
47445 +       shared_info_t *s = HYPERVISOR_shared_info;
47446 +       vcpu_info_t   *vcpu_info = &s->vcpu_info[cpu];
47447 +
47448 +       vcpu_info->evtchn_upcall_pending = 0;
47449 +
47450 +       /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
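+       /* l1 is the per-VCPU pending selector: each set bit covers one word
+        * (BITS_PER_LONG channels) of the shared evtchn_pending[] bitmap. */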
47451 +       l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
47452 +       while (l1 != 0) {
47453 +               l1i = __ffs(l1);
47454 +               l1 &= ~(1UL << l1i);
47455 +
47456 +               while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
47457 +                       l2i = __ffs(l2);
47458 +
47459 +                       port = (l1i * BITS_PER_LONG) + l2i;
47460 +                       if ((irq = evtchn_to_irq[port]) != -1)
47461 +                               do_IRQ(irq, regs);
47462 +                       else {
47463 +                               exit_idle();
47464 +                               evtchn_device_upcall(port);
47465 +                       }
47466 +               }
47467 +       }
47468 +}
47469 +
47470 +static int find_unbound_irq(void)
47471 +{
47472 +       int irq;
47473 +
47474 +       for (irq = 0; irq < NR_IRQS; irq++)
47475 +               if (irq_bindcount[irq] == 0)
47476 +                       break;
47477 +
47478 +       if (irq == NR_IRQS)
47479 +               panic("No available IRQ to bind to: increase NR_IRQS!\n");
47480 +
47481 +       return irq;
47482 +}
47483 +
47484 +static int bind_evtchn_to_irq(unsigned int evtchn)
47485 +{
47486 +       int irq;
47487 +
47488 +       spin_lock(&irq_mapping_update_lock);
47489 +
47490 +       if ((irq = evtchn_to_irq[evtchn]) == -1) {
47491 +               irq = find_unbound_irq();
47492 +               evtchn_to_irq[evtchn] = irq;
47493 +               irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
47494 +       }
47495 +
47496 +       irq_bindcount[irq]++;
47497 +
47498 +       spin_unlock(&irq_mapping_update_lock);
47499 +
47500 +       return irq;
47501 +}
47502 +
47503 +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
47504 +{
47505 +       evtchn_op_t op = { .cmd = EVTCHNOP_bind_virq };
47506 +       int evtchn, irq;
47507 +
47508 +       spin_lock(&irq_mapping_update_lock);
47509 +
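+       /* Reuse any existing binding for this (virq, cpu) pair; irq_bindcount
+        * counts the handlers sharing the IRQ. */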
47510 +       if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
47511 +               op.u.bind_virq.virq = virq;
47512 +               op.u.bind_virq.vcpu = cpu;
47513 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
47514 +               evtchn = op.u.bind_virq.port;
47515 +
47516 +               irq = find_unbound_irq();
47517 +               evtchn_to_irq[evtchn] = irq;
47518 +               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
47519 +
47520 +               per_cpu(virq_to_irq, cpu)[virq] = irq;
47521 +
47522 +               bind_evtchn_to_cpu(evtchn, cpu);
47523 +       }
47524 +
47525 +       irq_bindcount[irq]++;
47526 +
47527 +       spin_unlock(&irq_mapping_update_lock);
47528 +
47529 +       return irq;
47530 +}
47531 +
47532 +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
47533 +{
47534 +       evtchn_op_t op = { .cmd = EVTCHNOP_bind_ipi };
47535 +       int evtchn, irq;
47536 +
47537 +       spin_lock(&irq_mapping_update_lock);
47538 +
47539 +       if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
47540 +               op.u.bind_ipi.vcpu = cpu;
47541 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
47542 +               evtchn = op.u.bind_ipi.port;
47543 +
47544 +               irq = find_unbound_irq();
47545 +               evtchn_to_irq[evtchn] = irq;
47546 +               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
47547 +
47548 +               per_cpu(ipi_to_irq, cpu)[ipi] = irq;
47549 +
47550 +               bind_evtchn_to_cpu(evtchn, cpu);
47551 +       }
47552 +
47553 +       irq_bindcount[irq]++;
47554 +
47555 +       spin_unlock(&irq_mapping_update_lock);
47556 +
47557 +       return irq;
47558 +}
47559 +
47560 +static void unbind_from_irq(unsigned int irq)
47561 +{
47562 +       evtchn_op_t op = { .cmd = EVTCHNOP_close };
47563 +       int evtchn = evtchn_from_irq(irq);
47564 +
47565 +       spin_lock(&irq_mapping_update_lock);
47566 +
47567 +       if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
47568 +               op.u.close.port = evtchn;
47569 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
47570 +
47571 +               switch (type_from_irq(irq)) {
47572 +               case IRQT_VIRQ:
47573 +                       per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
47574 +                               [index_from_irq(irq)] = -1;
47575 +                       break;
47576 +               case IRQT_IPI:
47577 +                       per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
47578 +                               [index_from_irq(irq)] = -1;
47579 +                       break;
47580 +               default:
47581 +                       break;
47582 +               }
47583 +
47584 +               /* Closed ports are implicitly re-bound to VCPU0. */
47585 +               bind_evtchn_to_cpu(evtchn, 0);
47586 +
47587 +               evtchn_to_irq[evtchn] = -1;
47588 +               irq_info[irq] = IRQ_UNBOUND;
47589 +       }
47590 +
47591 +       spin_unlock(&irq_mapping_update_lock);
47592 +}
47593 +
47594 +int bind_evtchn_to_irqhandler(
47595 +       unsigned int evtchn,
47596 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
47597 +       unsigned long irqflags,
47598 +       const char *devname,
47599 +       void *dev_id)
47600 +{
47601 +       unsigned int irq;
47602 +       int retval;
47603 +
47604 +       irq = bind_evtchn_to_irq(evtchn);
47605 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
47606 +       if (retval != 0) {
47607 +               unbind_from_irq(irq);
47608 +               return retval;
47609 +       }
47610 +
47611 +       return irq;
47612 +}
47613 +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
47614 +
47615 +int bind_virq_to_irqhandler(
47616 +       unsigned int virq,
47617 +       unsigned int cpu,
47618 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
47619 +       unsigned long irqflags,
47620 +       const char *devname,
47621 +       void *dev_id)
47622 +{
47623 +       unsigned int irq;
47624 +       int retval;
47625 +
47626 +       irq = bind_virq_to_irq(virq, cpu);
47627 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
47628 +       if (retval != 0) {
47629 +               unbind_from_irq(irq);
47630 +               return retval;
47631 +       }
47632 +
47633 +       return irq;
47634 +}
47635 +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
47636 +
47637 +int bind_ipi_to_irqhandler(
47638 +       unsigned int ipi,
47639 +       unsigned int cpu,
47640 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
47641 +       unsigned long irqflags,
47642 +       const char *devname,
47643 +       void *dev_id)
47644 +{
47645 +       unsigned int irq;
47646 +       int retval;
47647 +
47648 +       irq = bind_ipi_to_irq(ipi, cpu);
47649 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
47650 +       if (retval != 0) {
47651 +               unbind_from_irq(irq);
47652 +               return retval;
47653 +       }
47654 +
47655 +       return irq;
47656 +}
47657 +EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
47658 +
47659 +void unbind_from_irqhandler(unsigned int irq, void *dev_id)
47660 +{
47661 +       free_irq(irq, dev_id);
47662 +       unbind_from_irq(irq);
47663 +}
47664 +EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
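/*
 * A minimal usage sketch for the binding API above: a driver binds a VIRQ
 * to a Linux IRQ and installs its handler in one call, then tears both down
 * with unbind_from_irqhandler().  The driver, its handler and the choice of
 * VIRQ_DEBUG are hypothetical; the calls are the ones defined in this file,
 * and SA_INTERRUPT is the 2.6-era request_irq() flag.
 */
static irqreturn_t example_virq_interrupt(int irq, void *dev_id,
                                          struct pt_regs *regs)
{
        /* The event channel is acked by ack_dynirq(); just do the work. */
        return IRQ_HANDLED;
}

static int example_bind(void)
{
        int irq = bind_virq_to_irqhandler(VIRQ_DEBUG, 0 /* cpu */,
                                          example_virq_interrupt,
                                          SA_INTERRUPT, "example", NULL);

        if (irq < 0)
                return irq;

        /* ... later: drops the handler, and closes the event channel once
         * the last binding reference goes away. */
        unbind_from_irqhandler(irq, NULL);
        return 0;
}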
47665 +
47666 +#ifdef CONFIG_SMP
47667 +static void do_nothing_function(void *ign)
47668 +{
47669 +}
47670 +#endif
47671 +
47672 +/* Rebind an evtchn so that it gets delivered to a specific cpu */
47673 +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
47674 +{
47675 +       evtchn_op_t op = { .cmd = EVTCHNOP_bind_vcpu };
47676 +       int evtchn;
47677 +
47678 +       spin_lock(&irq_mapping_update_lock);
47679 +
47680 +       evtchn = evtchn_from_irq(irq);
47681 +       if (!VALID_EVTCHN(evtchn)) {
47682 +               spin_unlock(&irq_mapping_update_lock);
47683 +               return;
47684 +       }
47685 +
47686 +       /* Send future instances of this interrupt to the other vcpu. */
47687 +       op.u.bind_vcpu.port = evtchn;
47688 +       op.u.bind_vcpu.vcpu = tcpu;
47689 +
47690 +       /*
47691 +        * If this fails, it usually just indicates that we're dealing with a
47692 +        * virq or IPI channel, which doesn't actually need to be rebound.
47693 +        * Ignore the error, but skip the xenlinux-level rebind in that case.
47694 +        */
47695 +       if (HYPERVISOR_event_channel_op(&op) >= 0)
47696 +               bind_evtchn_to_cpu(evtchn, tcpu);
47697 +
47698 +       spin_unlock(&irq_mapping_update_lock);
47699 +
47700 +       /*
47701 +        * Now send the new target processor a NOP IPI. When this returns, it
47702 +        * will check for any pending interrupts, and so service any that got 
47703 +        * delivered to the wrong processor by mistake.
47704 +        * 
47705 +        * XXX: The only time this is called with interrupts disabled is from
47706 +        * the hotplug/hotunplug path. In that case, all cpus are stopped with 
47707 +        * interrupts disabled, and the missed interrupts will be picked up
47708 +        * when they start again. This is kind of a hack.
47709 +        */
47710 +       if (!irqs_disabled())
47711 +               smp_call_function(do_nothing_function, NULL, 0, 0);
47712 +}
47713 +
47714 +
47715 +static void set_affinity_irq(unsigned irq, cpumask_t dest)
47716 +{
47717 +       unsigned tcpu = first_cpu(dest);
47718 +       rebind_irq_to_cpu(irq, tcpu);
47719 +}
47720 +
47721 +/*
47722 + * Interface to generic handling in irq.c
47723 + */
47724 +
47725 +static unsigned int startup_dynirq(unsigned int irq)
47726 +{
47727 +       int evtchn = evtchn_from_irq(irq);
47728 +
47729 +       if (VALID_EVTCHN(evtchn))
47730 +               unmask_evtchn(evtchn);
47731 +       return 0;
47732 +}
47733 +
47734 +static void shutdown_dynirq(unsigned int irq)
47735 +{
47736 +       int evtchn = evtchn_from_irq(irq);
47737 +
47738 +       if (VALID_EVTCHN(evtchn))
47739 +               mask_evtchn(evtchn);
47740 +}
47741 +
47742 +static void enable_dynirq(unsigned int irq)
47743 +{
47744 +       int evtchn = evtchn_from_irq(irq);
47745 +
47746 +       if (VALID_EVTCHN(evtchn))
47747 +               unmask_evtchn(evtchn);
47748 +}
47749 +
47750 +static void disable_dynirq(unsigned int irq)
47751 +{
47752 +       int evtchn = evtchn_from_irq(irq);
47753 +
47754 +       if (VALID_EVTCHN(evtchn))
47755 +               mask_evtchn(evtchn);
47756 +}
47757 +
47758 +static void ack_dynirq(unsigned int irq)
47759 +{
47760 +       int evtchn = evtchn_from_irq(irq);
47761 +
47762 +       if (VALID_EVTCHN(evtchn)) {
47763 +               mask_evtchn(evtchn);
47764 +               clear_evtchn(evtchn);
47765 +       }
47766 +}
47767 +
47768 +static void end_dynirq(unsigned int irq)
47769 +{
47770 +       int evtchn = evtchn_from_irq(irq);
47771 +
47772 +       if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED))
47773 +               unmask_evtchn(evtchn);
47774 +}
47775 +
47776 +static struct hw_interrupt_type dynirq_type = {
47777 +       "Dynamic-irq",
47778 +       startup_dynirq,
47779 +       shutdown_dynirq,
47780 +       enable_dynirq,
47781 +       disable_dynirq,
47782 +       ack_dynirq,
47783 +       end_dynirq,
47784 +       set_affinity_irq
47785 +};
47786 +
47787 +static inline void pirq_unmask_notify(int pirq)
47788 +{
47789 +       physdev_op_t op;
47790 +       if (unlikely(test_bit(pirq, &pirq_needs_unmask_notify[0]))) {
47791 +               op.cmd = PHYSDEVOP_IRQ_UNMASK_NOTIFY;
47792 +               (void)HYPERVISOR_physdev_op(&op);
47793 +       }
47794 +}
47795 +
47796 +static inline void pirq_query_unmask(int pirq)
47797 +{
47798 +       physdev_op_t op;
47799 +       op.cmd = PHYSDEVOP_IRQ_STATUS_QUERY;
47800 +       op.u.irq_status_query.irq = pirq;
47801 +       (void)HYPERVISOR_physdev_op(&op);
47802 +       clear_bit(pirq, &pirq_needs_unmask_notify[0]);
47803 +       if (op.u.irq_status_query.flags & PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY)
47804 +               set_bit(pirq, &pirq_needs_unmask_notify[0]);
47805 +}
47806 +
47807 +/*
47808 + * On startup, if there is no action associated with the IRQ then we are
47809 + * probing. In this case we should not share with others as it will confuse us.
47810 + */
47811 +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
47812 +
47813 +static unsigned int startup_pirq(unsigned int irq)
47814 +{
47815 +       evtchn_op_t op = { .cmd = EVTCHNOP_bind_pirq };
47816 +       int evtchn = evtchn_from_irq(irq);
47817 +
47818 +       if (VALID_EVTCHN(evtchn))
47819 +               goto out;
47820 +
47821 +       op.u.bind_pirq.pirq  = irq;
47822 +       /* NB. We are happy to share unless we are probing. */
47823 +       op.u.bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
47824 +       if (HYPERVISOR_event_channel_op(&op) != 0) {
47825 +               if (!probing_irq(irq))
47826 +                       printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
47827 +                              irq);
47828 +               return 0;
47829 +       }
47830 +       evtchn = op.u.bind_pirq.port;
47831 +
47832 +       pirq_query_unmask(irq_to_pirq(irq));
47833 +
47834 +       bind_evtchn_to_cpu(evtchn, 0);
47835 +       evtchn_to_irq[evtchn] = irq;
47836 +       irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn);
47837 +
47838 + out:
47839 +       unmask_evtchn(evtchn);
47840 +       pirq_unmask_notify(irq_to_pirq(irq));
47841 +
47842 +       return 0;
47843 +}
47844 +
47845 +static void shutdown_pirq(unsigned int irq)
47846 +{
47847 +       evtchn_op_t op = { .cmd = EVTCHNOP_close };
47848 +       int evtchn = evtchn_from_irq(irq);
47849 +
47850 +       if (!VALID_EVTCHN(evtchn))
47851 +               return;
47852 +
47853 +       mask_evtchn(evtchn);
47854 +
47855 +       op.u.close.port = evtchn;
47856 +       BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
47857 +
47858 +       bind_evtchn_to_cpu(evtchn, 0);
47859 +       evtchn_to_irq[evtchn] = -1;
47860 +       irq_info[irq] = IRQ_UNBOUND;
47861 +}
47862 +
47863 +static void enable_pirq(unsigned int irq)
47864 +{
47865 +       int evtchn = evtchn_from_irq(irq);
47866 +
47867 +       if (VALID_EVTCHN(evtchn)) {
47868 +               unmask_evtchn(evtchn);
47869 +               pirq_unmask_notify(irq_to_pirq(irq));
47870 +       }
47871 +}
47872 +
47873 +static void disable_pirq(unsigned int irq)
47874 +{
47875 +       int evtchn = evtchn_from_irq(irq);
47876 +
47877 +       if (VALID_EVTCHN(evtchn))
47878 +               mask_evtchn(evtchn);
47879 +}
47880 +
47881 +static void ack_pirq(unsigned int irq)
47882 +{
47883 +       int evtchn = evtchn_from_irq(irq);
47884 +
47885 +       if (VALID_EVTCHN(evtchn)) {
47886 +               mask_evtchn(evtchn);
47887 +               clear_evtchn(evtchn);
47888 +       }
47889 +}
47890 +
47891 +static void end_pirq(unsigned int irq)
47892 +{
47893 +       int evtchn = evtchn_from_irq(irq);
47894 +
47895 +       if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) {
47896 +               unmask_evtchn(evtchn);
47897 +               pirq_unmask_notify(irq_to_pirq(irq));
47898 +       }
47899 +}
47900 +
47901 +static struct hw_interrupt_type pirq_type = {
47902 +       "Phys-irq",
47903 +       startup_pirq,
47904 +       shutdown_pirq,
47905 +       enable_pirq,
47906 +       disable_pirq,
47907 +       ack_pirq,
47908 +       end_pirq,
47909 +       set_affinity_irq
47910 +};
47911 +
47912 +void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
47913 +{
47914 +       int evtchn = evtchn_from_irq(i);
47915 +       shared_info_t *s = HYPERVISOR_shared_info;
47916 +       if (!VALID_EVTCHN(evtchn))
47917 +               return;
47918 +       BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
47919 +       synch_set_bit(evtchn, &s->evtchn_pending[0]);
47920 +}
47921 +
47922 +void notify_remote_via_irq(int irq)
47923 +{
47924 +       int evtchn = evtchn_from_irq(irq);
47925 +
47926 +       if (VALID_EVTCHN(evtchn))
47927 +               notify_remote_via_evtchn(evtchn);
47928 +}
47929 +EXPORT_SYMBOL_GPL(notify_remote_via_irq);
47930 +
47931 +void mask_evtchn(int port)
47932 +{
47933 +       shared_info_t *s = HYPERVISOR_shared_info;
47934 +       synch_set_bit(port, &s->evtchn_mask[0]);
47935 +}
47936 +EXPORT_SYMBOL_GPL(mask_evtchn);
47937 +
47938 +void unmask_evtchn(int port)
47939 +{
47940 +       shared_info_t *s = HYPERVISOR_shared_info;
47941 +       unsigned int cpu = smp_processor_id();
47942 +       vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
47943 +
47944 +       /* Slow path (hypercall) if this is a non-local port. */
47945 +       if (unlikely(cpu != cpu_from_evtchn(port))) {
47946 +               evtchn_op_t op = { .cmd = EVTCHNOP_unmask,
47947 +                                  .u.unmask.port = port };
47948 +               (void)HYPERVISOR_event_channel_op(&op);
47949 +               return;
47950 +       }
47951 +
47952 +       synch_clear_bit(port, &s->evtchn_mask[0]);
47953 +
47954 +       /*
47955 +        * The following is basically the equivalent of 'hw_resend_irq'. Just
47956 +        * like a real IO-APIC we 'lose the interrupt edge' if the channel is
47957 +        * masked.
47958 +        */
47959 +       if (synch_test_bit(port, &s->evtchn_pending[0]) &&
47960 +           !synch_test_and_set_bit(port / BITS_PER_LONG,
47961 +                                   &vcpu_info->evtchn_pending_sel)) {
47962 +               vcpu_info->evtchn_upcall_pending = 1;
47963 +               if (!vcpu_info->evtchn_upcall_mask)
47964 +                       force_evtchn_callback();
47965 +       }
47966 +}
47967 +EXPORT_SYMBOL_GPL(unmask_evtchn);
47968 +
47969 +void irq_resume(void)
47970 +{
47971 +       evtchn_op_t op;
47972 +       int         cpu, pirq, virq, ipi, irq, evtchn;
47973 +
47974 +       init_evtchn_cpu_bindings();
47975 +
47976 +       /* New event-channel space is not 'live' yet. */
47977 +       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
47978 +               mask_evtchn(evtchn);
47979 +
47980 +       /* Check that no PIRQs are still bound. */
47981 +       for (pirq = 0; pirq < NR_PIRQS; pirq++)
47982 +               BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
47983 +
47984 +       /* Secondary CPUs must have no VIRQ or IPI bindings. */
47985 +       for (cpu = 1; cpu < NR_CPUS; cpu++) {
47986 +               for (virq = 0; virq < NR_VIRQS; virq++)
47987 +                       BUG_ON(per_cpu(virq_to_irq, cpu)[virq] != -1);
47988 +               for (ipi = 0; ipi < NR_IPIS; ipi++)
47989 +                       BUG_ON(per_cpu(ipi_to_irq, cpu)[ipi] != -1);
47990 +       }
47991 +
47992 +       /* No IRQ <-> event-channel mappings. */
47993 +       for (irq = 0; irq < NR_IRQS; irq++)
47994 +               irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */
47995 +       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
47996 +               evtchn_to_irq[evtchn] = -1;
47997 +
47998 +       /* Primary CPU: rebind VIRQs automatically. */
47999 +       for (virq = 0; virq < NR_VIRQS; virq++) {
48000 +               if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1)
48001 +                       continue;
48002 +
48003 +               BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
48004 +
48005 +               /* Get a new binding from Xen. */
48006 +               memset(&op, 0, sizeof(op));
48007 +               op.cmd              = EVTCHNOP_bind_virq;
48008 +               op.u.bind_virq.virq = virq;
48009 +               op.u.bind_virq.vcpu = 0;
48010 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
48011 +               evtchn = op.u.bind_virq.port;
48012 +
48013 +               /* Record the new mapping. */
48014 +               evtchn_to_irq[evtchn] = irq;
48015 +               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
48016 +
48017 +               /* Ready for use. */
48018 +               unmask_evtchn(evtchn);
48019 +       }
48020 +
48021 +       /* Primary CPU: rebind IPIs automatically. */
48022 +       for (ipi = 0; ipi < NR_IPIS; ipi++) {
48023 +               if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1)
48024 +                       continue;
48025 +
48026 +               BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
48027 +
48028 +               /* Get a new binding from Xen. */
48029 +               memset(&op, 0, sizeof(op));
48030 +               op.cmd = EVTCHNOP_bind_ipi;
48031 +               op.u.bind_ipi.vcpu = 0;
48032 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
48033 +               evtchn = op.u.bind_ipi.port;
48034 +
48035 +               /* Record the new mapping. */
48036 +               evtchn_to_irq[evtchn] = irq;
48037 +               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
48038 +
48039 +               /* Ready for use. */
48040 +               unmask_evtchn(evtchn);
48041 +       }
48042 +}
48043 +
48044 +void __init init_IRQ(void)
48045 +{
48046 +       int i;
48047 +       int cpu;
48048 +
48049 +       irq_ctx_init(0);
48050 +
48051 +       spin_lock_init(&irq_mapping_update_lock);
48052 +
48053 +       init_evtchn_cpu_bindings();
48054 +
48055 +       /* No VIRQ or IPI bindings. */
48056 +       for (cpu = 0; cpu < NR_CPUS; cpu++) {
48057 +               for (i = 0; i < NR_VIRQS; i++)
48058 +                       per_cpu(virq_to_irq, cpu)[i] = -1;
48059 +               for (i = 0; i < NR_IPIS; i++)
48060 +                       per_cpu(ipi_to_irq, cpu)[i] = -1;
48061 +       }
48062 +
48063 +       /* No event-channel -> IRQ mappings. */
48064 +       for (i = 0; i < NR_EVENT_CHANNELS; i++) {
48065 +               evtchn_to_irq[i] = -1;
48066 +               mask_evtchn(i); /* No event channels are 'live' right now. */
48067 +       }
48068 +
48069 +       /* No IRQ -> event-channel mappings. */
48070 +       for (i = 0; i < NR_IRQS; i++)
48071 +               irq_info[i] = IRQ_UNBOUND;
48072 +
48073 +       /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
48074 +       for (i = 0; i < NR_DYNIRQS; i++) {
48075 +               irq_bindcount[dynirq_to_irq(i)] = 0;
48076 +
48077 +               irq_desc[dynirq_to_irq(i)].status  = IRQ_DISABLED;
48078 +               irq_desc[dynirq_to_irq(i)].action  = NULL;
48079 +               irq_desc[dynirq_to_irq(i)].depth   = 1;
48080 +               irq_desc[dynirq_to_irq(i)].handler = &dynirq_type;
48081 +       }
48082 +
48083 +       /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
48084 +       for (i = 0; i < NR_PIRQS; i++) {
48085 +               irq_bindcount[pirq_to_irq(i)] = 1;
48086 +
48087 +#ifdef RTC_IRQ
48088 +               /* If not domain 0, force our RTC driver to fail its probe. */
48089 +               if ((i == RTC_IRQ) &&
48090 +                   !(xen_start_info->flags & SIF_INITDOMAIN))
48091 +                       continue;
48092 +#endif
48093 +
48094 +               irq_desc[pirq_to_irq(i)].status  = IRQ_DISABLED;
48095 +               irq_desc[pirq_to_irq(i)].action  = NULL;
48096 +               irq_desc[pirq_to_irq(i)].depth   = 1;
48097 +               irq_desc[pirq_to_irq(i)].handler = &pirq_type;
48098 +       }
48099 +}
48100 +
48101 +/*
48102 + * Local variables:
48103 + *  c-file-style: "linux"
48104 + *  indent-tabs-mode: t
48105 + *  c-indent-level: 8
48106 + *  c-basic-offset: 8
48107 + *  tab-width: 8
48108 + * End:
48109 + */
48110 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/features.c tmp-linux-2.6-xen.patch/drivers/xen/core/features.c
48111 --- ref-linux-2.6.16.9/drivers/xen/core/features.c      1970-01-01 01:00:00.000000000 +0100
48112 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/features.c 2006-04-10 00:05:52.000000000 +0200
48113 @@ -0,0 +1,29 @@
48114 +/******************************************************************************
48115 + * features.c
48116 + *
48117 + * Xen feature flags.
48118 + *
48119 + * Copyright (c) 2006, Ian Campbell, XenSource Inc.
48120 + */
48121 +#include <linux/types.h>
48122 +#include <linux/cache.h>
48123 +#include <linux/module.h>
48124 +#include <asm/hypervisor.h>
48125 +#include <xen/features.h>
48126 +
48127 +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
48128 +EXPORT_SYMBOL_GPL(xen_features);
48129 +
48130 +void setup_xen_features(void)
48131 +{
48132 +       xen_feature_info_t fi;
48133 +       int i, j;
48134 +
48135 +       for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
48136 +               fi.submap_idx = i;
48137 +               if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
48138 +                       break;
48139 +               for (j = 0; j < 32; j++)
48140 +                       xen_features[i * 32 + j] = !!(fi.submap & (1 << j));
48141 +       }
48142 +}
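/*
 * setup_xen_features() flattens each 32-bit submap returned by
 * XENVER_get_features into one byte per flag, so feature tests become plain
 * array lookups.  A sketch of a caller, assuming the xen_feature() accessor
 * from <xen/features.h> (the suspend code later in this patch tests the
 * same flag the same way):
 */
static void example_check_features(void)
{
        if (xen_feature(XENFEAT_auto_translated_physmap))
                printk(KERN_INFO "xen: auto-translated physmap in use\n");
}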
48143 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/gnttab.c tmp-linux-2.6-xen.patch/drivers/xen/core/gnttab.c
48144 --- ref-linux-2.6.16.9/drivers/xen/core/gnttab.c        1970-01-01 01:00:00.000000000 +0100
48145 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/gnttab.c   2006-04-10 00:05:52.000000000 +0200
48146 @@ -0,0 +1,464 @@
48147 +/******************************************************************************
48148 + * gnttab.c
48149 + * 
48150 + * Granting foreign access to our memory reservation.
48151 + * 
48152 + * Copyright (c) 2005, Christopher Clark
48153 + * Copyright (c) 2004-2005, K A Fraser
48154 + * 
48155 + * This program is free software; you can redistribute it and/or
48156 + * modify it under the terms of the GNU General Public License version 2
48157 + * as published by the Free Software Foundation; or, when distributed
48158 + * separately from the Linux kernel or incorporated into other
48159 + * software packages, subject to the following license:
48160 + * 
48161 + * Permission is hereby granted, free of charge, to any person obtaining a copy
48162 + * of this source file (the "Software"), to deal in the Software without
48163 + * restriction, including without limitation the rights to use, copy, modify,
48164 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
48165 + * and to permit persons to whom the Software is furnished to do so, subject to
48166 + * the following conditions:
48167 + * 
48168 + * The above copyright notice and this permission notice shall be included in
48169 + * all copies or substantial portions of the Software.
48170 + * 
48171 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
48172 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
48173 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
48174 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48175 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
48176 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
48177 + * IN THE SOFTWARE.
48178 + */
48179 +
48180 +#include <linux/config.h>
48181 +#include <linux/module.h>
48182 +#include <linux/sched.h>
48183 +#include <linux/mm.h>
48184 +#include <linux/vmalloc.h>
48185 +#include <asm/pgtable.h>
48186 +#include <xen/interface/xen.h>
48187 +#include <asm/fixmap.h>
48188 +#include <asm/uaccess.h>
48189 +#include <xen/gnttab.h>
48190 +#include <asm/synch_bitops.h>
48191 +
48192 +#if 1
48193 +#define ASSERT(_p)                                                           \
48194 +       if (!(_p)) { printk(KERN_ALERT"Assertion '%s': line %d, file %s\n",   \
48195 +       #_p , __LINE__, __FILE__); *(int*)0=0; }
48196 +#else
48197 +#define ASSERT(_p) ((void)0)
48198 +#endif
48199 +
48200 +#define WPRINTK(fmt, args...)                          \
48201 +       printk(KERN_WARNING "xen_grant: " fmt, ##args)
48202 +
48203 +
48204 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
48205 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
48206 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
48207 +EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
48208 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
48209 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
48210 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
48211 +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
48212 +EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
48213 +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
48214 +EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
48215 +EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
48216 +EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
48217 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
48218 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
48219 +
48220 +/* External tools reserve the first few grant table entries. */
48221 +#define NR_RESERVED_ENTRIES 8
48222 +
48223 +#define NR_GRANT_ENTRIES (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(grant_entry_t))
48224 +#define GNTTAB_LIST_END (NR_GRANT_ENTRIES + 1)
48225 +
48226 +static grant_ref_t gnttab_list[NR_GRANT_ENTRIES];
48227 +static int gnttab_free_count;
48228 +static grant_ref_t gnttab_free_head;
48229 +static spinlock_t gnttab_list_lock = SPIN_LOCK_UNLOCKED;
48230 +
48231 +static grant_entry_t *shared = NULL;
48232 +
48233 +static struct gnttab_free_callback *gnttab_free_callback_list = NULL;
48234 +
48235 +static int
48236 +get_free_entries(int count)
48237 +{
48238 +       unsigned long flags;
48239 +       int ref;
48240 +       grant_ref_t head;
48241 +       spin_lock_irqsave(&gnttab_list_lock, flags);
48242 +       if (gnttab_free_count < count) {
48243 +               spin_unlock_irqrestore(&gnttab_list_lock, flags);
48244 +               return -1;
48245 +       }
48246 +       ref = head = gnttab_free_head;
48247 +       gnttab_free_count -= count;
48248 +       while (count-- > 1)
48249 +               head = gnttab_list[head];
48250 +       gnttab_free_head = gnttab_list[head];
48251 +       gnttab_list[head] = GNTTAB_LIST_END;
48252 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
48253 +       return ref;
48254 +}
48255 +
48256 +#define get_free_entry() get_free_entries(1)
48257 +
48258 +static void
48259 +do_free_callbacks(void)
48260 +{
48261 +       struct gnttab_free_callback *callback, *next;
48262 +
48263 +       callback = gnttab_free_callback_list;
48264 +       gnttab_free_callback_list = NULL;
48265 +
48266 +       while (callback != NULL) {
48267 +               next = callback->next;
48268 +               if (gnttab_free_count >= callback->count) {
48269 +                       callback->next = NULL;
48270 +                       callback->fn(callback->arg);
48271 +               } else {
48272 +                       callback->next = gnttab_free_callback_list;
48273 +                       gnttab_free_callback_list = callback;
48274 +               }
48275 +               callback = next;
48276 +       }
48277 +}
48278 +
48279 +static inline void
48280 +check_free_callbacks(void)
48281 +{
48282 +       if (unlikely(gnttab_free_callback_list))
48283 +               do_free_callbacks();
48284 +}
48285 +
48286 +static void
48287 +put_free_entry(grant_ref_t ref)
48288 +{
48289 +       unsigned long flags;
48290 +       spin_lock_irqsave(&gnttab_list_lock, flags);
48291 +       gnttab_list[ref] = gnttab_free_head;
48292 +       gnttab_free_head = ref;
48293 +       gnttab_free_count++;
48294 +       check_free_callbacks();
48295 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
48296 +}
48297 +
48298 +/*
48299 + * Public grant-issuing interface functions
48300 + */
48301 +
48302 +int
48303 +gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly)
48304 +{
48305 +       int ref;
48306 +
48307 +       if (unlikely((ref = get_free_entry()) == -1))
48308 +               return -ENOSPC;
48309 +
48310 +       shared[ref].frame = frame;
48311 +       shared[ref].domid = domid;
48312 +       wmb();
48313 +       shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
48314 +
48315 +       return ref;
48316 +}
48317 +
48318 +void
48319 +gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
48320 +                               unsigned long frame, int readonly)
48321 +{
48322 +       shared[ref].frame = frame;
48323 +       shared[ref].domid = domid;
48324 +       wmb();
48325 +       shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
48326 +}
48327 +
48328 +
48329 +int
48330 +gnttab_query_foreign_access(grant_ref_t ref)
48331 +{
48332 +       u16 nflags;
48333 +
48334 +       nflags = shared[ref].flags;
48335 +
48336 +       return (nflags & (GTF_reading|GTF_writing));
48337 +}
48338 +
48339 +int
48340 +gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
48341 +{
48342 +       u16 flags, nflags;
48343 +
48344 +       nflags = shared[ref].flags;
48345 +       do {
48346 +               if ((flags = nflags) & (GTF_reading|GTF_writing)) {
48347 +                       printk(KERN_ALERT "WARNING: g.e. still in use!\n");
48348 +                       return 0;
48349 +               }
48350 +       } while ((nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) !=
48351 +                flags);
48352 +
48353 +       return 1;
48354 +}
48355 +
48356 +void
48357 +gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page)
48358 +{
48359 +       if (gnttab_end_foreign_access_ref(ref, readonly)) {
48360 +               put_free_entry(ref);
48361 +               if (page != 0) {
48362 +                       free_page(page);
48363 +               }
48364 +       } else {
48365 +               /* XXX This needs to be fixed so that the ref and page are
48366 +                  placed on a list to be freed up later. */
48367 +               printk(KERN_WARNING
48368 +                      "WARNING: leaking g.e. and page still in use!\n");
48369 +       }
48370 +}
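/*
 * The two halves above bracket the life of a shared page: grant access to
 * its machine frame, publish the reference to the peer out of band, then
 * revoke the grant (freeing the page) when the peer is done.  A sketch of
 * that round trip; "backend_domid" is a placeholder for a real remote
 * domain id.
 */
static int example_share_page(domid_t backend_domid)
{
        unsigned long page = __get_free_page(GFP_KERNEL);
        int ref;

        if (page == 0)
                return -ENOMEM;

        ref = gnttab_grant_foreign_access(backend_domid,
                                          virt_to_mfn(page), 1 /* ro */);
        if (ref < 0) {
                free_page(page);
                return ref;
        }

        /* ... hand "ref" to the peer via xenstore or a shared ring ... */

        /* Revoke; this also frees the page unless the peer still maps it. */
        gnttab_end_foreign_access(ref, 1, page);
        return 0;
}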
48371 +
48372 +int
48373 +gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
48374 +{
48375 +       int ref;
48376 +
48377 +       if (unlikely((ref = get_free_entry()) == -1))
48378 +               return -ENOSPC;
48379 +       gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
48380 +
48381 +       return ref;
48382 +}
48383 +
48384 +void
48385 +gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
48386 +                                 unsigned long pfn)
48387 +{
48388 +       shared[ref].frame = pfn;
48389 +       shared[ref].domid = domid;
48390 +       wmb();
48391 +       shared[ref].flags = GTF_accept_transfer;
48392 +}
48393 +
48394 +unsigned long
48395 +gnttab_end_foreign_transfer_ref(grant_ref_t ref)
48396 +{
48397 +       unsigned long frame;
48398 +       u16           flags;
48399 +
48400 +       /*
48401 +        * If a transfer has not yet even started, try to reclaim the grant
48402 +        * reference and return failure (== 0).
48403 +        */
48404 +       while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
48405 +               if (synch_cmpxchg(&shared[ref].flags, flags, 0) == flags)
48406 +                       return 0;
48407 +               cpu_relax();
48408 +       }
48409 +
48410 +       /* If a transfer is in progress then wait until it is completed. */
48411 +       while (!(flags & GTF_transfer_completed)) {
48412 +               flags = shared[ref].flags;
48413 +               cpu_relax();
48414 +       }
48415 +
48416 +       /* Read the frame number /after/ reading completion status. */
48417 +       rmb();
48418 +       frame = shared[ref].frame;
48419 +       BUG_ON(frame == 0);
48420 +
48421 +       return frame;
48422 +}
48423 +
48424 +unsigned long
48425 +gnttab_end_foreign_transfer(grant_ref_t ref)
48426 +{
48427 +       unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
48428 +       put_free_entry(ref);
48429 +       return frame;
48430 +}
48431 +
48432 +void
48433 +gnttab_free_grant_reference(grant_ref_t ref)
48434 +{
48435 +
48436 +       put_free_entry(ref);
48437 +}
48438 +
48439 +void
48440 +gnttab_free_grant_references(grant_ref_t head)
48441 +{
48442 +       grant_ref_t ref;
48443 +       unsigned long flags;
48444 +       int count = 1;
48445 +       if (head == GNTTAB_LIST_END)
48446 +               return;
48447 +       spin_lock_irqsave(&gnttab_list_lock, flags);
48448 +       ref = head;
48449 +       while (gnttab_list[ref] != GNTTAB_LIST_END) {
48450 +               ref = gnttab_list[ref];
48451 +               count++;
48452 +       }
48453 +       gnttab_list[ref] = gnttab_free_head;
48454 +       gnttab_free_head = head;
48455 +       gnttab_free_count += count;
48456 +       check_free_callbacks();
48457 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
48458 +}
48459 +
48460 +int
48461 +gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
48462 +{
48463 +       int h = get_free_entries(count);
48464 +
48465 +       if (h == -1)
48466 +               return -ENOSPC;
48467 +
48468 +       *head = h;
48469 +
48470 +       return 0;
48471 +}
48472 +
48473 +int
48474 +gnttab_claim_grant_reference(grant_ref_t *private_head)
48475 +{
48476 +       grant_ref_t g = *private_head;
48477 +       if (unlikely(g == GNTTAB_LIST_END))
48478 +               return -ENOSPC;
48479 +       *private_head = gnttab_list[g];
48480 +       return g;
48481 +}
48482 +
48483 +void
48484 +gnttab_release_grant_reference(grant_ref_t *private_head, grant_ref_t  release)
48485 +{
48486 +       gnttab_list[release] = *private_head;
48487 +       *private_head = release;
48488 +}
48489 +
48490 +void
48491 +gnttab_request_free_callback(struct gnttab_free_callback *callback,
48492 +                            void (*fn)(void *), void *arg, u16 count)
48493 +{
48494 +       unsigned long flags;
48495 +       spin_lock_irqsave(&gnttab_list_lock, flags);
48496 +       if (callback->next)
48497 +               goto out;
48498 +       callback->fn = fn;
48499 +       callback->arg = arg;
48500 +       callback->count = count;
48501 +       callback->next = gnttab_free_callback_list;
48502 +       gnttab_free_callback_list = callback;
48503 +       check_free_callbacks();
48504 + out:
48505 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
48506 +}
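/*
 * The alloc/claim/release trio above lets a driver take the list lock once
 * to reserve a private pool of references, then hand them out on its fast
 * path without further locking.  A sketch of the pattern; the pool size and
 * names are hypothetical.
 */
static grant_ref_t example_gref_head;

static int example_reserve_pool(void)
{
        return gnttab_alloc_grant_references(32, &example_gref_head);
}

static int example_grant_one(domid_t domid, unsigned long frame)
{
        int ref = gnttab_claim_grant_reference(&example_gref_head);

        if (ref < 0)
                return ref;     /* pool exhausted */
        gnttab_grant_foreign_access_ref(ref, domid, frame, 0 /* rw */);
        return ref;
}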
48507 +
48508 +#ifndef __ia64__
48509 +static int map_pte_fn(pte_t *pte, struct page *pmd_page,
48510 +                     unsigned long addr, void *data)
48511 +{
48512 +       unsigned long **frames = (unsigned long **)data;
48513 +
48514 +       set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL));
48515 +       (*frames)++;
48516 +       return 0;
48517 +}
48518 +
48519 +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
48520 +                     unsigned long addr, void *data)
48521 +{
48522 +
48523 +       set_pte_at(&init_mm, addr, pte, __pte(0));
48524 +       return 0;
48525 +}
48526 +#endif
48527 +
48528 +int
48529 +gnttab_resume(void)
48530 +{
48531 +       gnttab_setup_table_t setup;
48532 +       unsigned long frames[NR_GRANT_FRAMES];
48533 +       int rc;
48534 +#ifndef __ia64__
48535 +       void *pframes = frames;
48536 +       struct vm_struct *area;
48537 +#endif
48538 +
48539 +       setup.dom        = DOMID_SELF;
48540 +       setup.nr_frames  = NR_GRANT_FRAMES;
48541 +       setup.frame_list = frames;
48542 +
48543 +       rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
48544 +       if (rc == -ENOSYS)
48545 +               return -ENOSYS;
48546 +
48547 +       BUG_ON(rc || setup.status);
48548 +
48549 +#ifndef __ia64__
48550 +       if (shared == NULL) {
48551 +               area = get_vm_area(PAGE_SIZE * NR_GRANT_FRAMES, VM_IOREMAP);
48552 +               BUG_ON(area == NULL);
48553 +               shared = area->addr;
48554 +       }
48555 +       rc = apply_to_page_range(&init_mm, (unsigned long)shared,
48556 +                                PAGE_SIZE * NR_GRANT_FRAMES,
48557 +                                map_pte_fn, &pframes);
48558 +       BUG_ON(rc);
48559 +#else
48560 +       shared = __va(frames[0] << PAGE_SHIFT);
48561 +       printk(KERN_INFO "grant table at %p\n", shared);
48562 +#endif
48563 +
48564 +       return 0;
48565 +}
48566 +
48567 +int
48568 +gnttab_suspend(void)
48569 +{
48570 +
48571 +#ifndef __ia64__
48572 +       apply_to_page_range(&init_mm, (unsigned long)shared,
48573 +                           PAGE_SIZE * NR_GRANT_FRAMES,
48574 +                           unmap_pte_fn, NULL);
48575 +#endif
48576 +
48577 +       return 0;
48578 +}
48579 +
48580 +static int __init
48581 +gnttab_init(void)
48582 +{
48583 +       int i;
48584 +
48585 +       if (xen_init() < 0)
48586 +               return -ENODEV;
48587 +
48588 +       if (gnttab_resume() < 0)
48589 +               return -ENODEV;
48590 +
48591 +       for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
48592 +               gnttab_list[i] = i + 1;
48593 +       gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES;
48594 +       gnttab_free_head  = NR_RESERVED_ENTRIES;
48595 +
48596 +       printk(KERN_INFO "Grant table initialized\n");
48597 +       return 0;
48598 +}
48599 +
48600 +core_initcall(gnttab_init);
48601 +
48602 +/*
48603 + * Local variables:
48604 + *  c-file-style: "linux"
48605 + *  indent-tabs-mode: t
48606 + *  c-indent-level: 8
48607 + *  c-basic-offset: 8
48608 + *  tab-width: 8
48609 + * End:
48610 + */
48611 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/hypervisor_sysfs.c tmp-linux-2.6-xen.patch/drivers/xen/core/hypervisor_sysfs.c
48612 --- ref-linux-2.6.16.9/drivers/xen/core/hypervisor_sysfs.c      1970-01-01 01:00:00.000000000 +0100
48613 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/hypervisor_sysfs.c 2006-04-10 00:05:52.000000000 +0200
48614 @@ -0,0 +1,57 @@
48615 +/*
48616 + *  copyright (c) 2006 IBM Corporation
48617 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
48618 + *
48619 + *  This program is free software; you can redistribute it and/or modify
48620 + *  it under the terms of the GNU General Public License version 2 as
48621 + *  published by the Free Software Foundation.
48622 + */
48623 +
48624 +#include <linux/config.h>
48625 +#include <linux/kernel.h>
48626 +#include <linux/module.h>
48627 +#include <linux/kobject.h>
48628 +#include <xen/hypervisor_sysfs.h>
48629 +
48630 +decl_subsys(hypervisor, NULL, NULL);
48631 +
48632 +static ssize_t hyp_sysfs_show(struct kobject *kobj,
48633 +                             struct attribute *attr,
48634 +                             char *buffer)
48635 +{
48636 +       struct hyp_sysfs_attr *hyp_attr;
48637 +       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
48638 +       if (hyp_attr->show)
48639 +               return hyp_attr->show(hyp_attr, buffer);
48640 +       return 0;
48641 +}
48642 +
48643 +static ssize_t hyp_sysfs_store(struct kobject *kobj,
48644 +                              struct attribute *attr,
48645 +                              const char *buffer,
48646 +                              size_t len)
48647 +{
48648 +       struct hyp_sysfs_attr *hyp_attr;
48649 +       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
48650 +       if (hyp_attr->store)
48651 +               return hyp_attr->store(hyp_attr, buffer, len);
48652 +       return 0;
48653 +}
48654 +
48655 +struct sysfs_ops hyp_sysfs_ops = {
48656 +       .show = hyp_sysfs_show,
48657 +       .store = hyp_sysfs_store,
48658 +};
48659 +
48660 +static struct kobj_type hyp_sysfs_kobj_type = {
48661 +       .sysfs_ops = &hyp_sysfs_ops,
48662 +};
48663 +
48664 +static int __init hypervisor_subsys_init(void)
48665 +{
48666 +       hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
48667 +       return subsystem_register(&hypervisor_subsys);
48668 +}
48669 +
48670 +device_initcall(hypervisor_subsys_init);
48671 +EXPORT_SYMBOL_GPL(hypervisor_subsys);
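/*
 * Each attribute under /sys/hypervisor supplies its own show/store via a
 * struct hyp_sysfs_attr wrapper; the ops above only dispatch.  A sketch of
 * a read-only attribute, assuming the wrapper's fields are the embedded
 * struct attribute plus the show/store hooks implied by the container_of()
 * calls above; the attribute itself is made up.
 */
static ssize_t example_show(struct hyp_sysfs_attr *attr, char *buffer)
{
        return sprintf(buffer, "example\n");
}

static struct hyp_sysfs_attr example_attr = {
        .attr = { .name = "example", .mode = 0444 },
        .show = example_show,
};

/* ... registered against the subsystem's kobject with:
 *     sysfs_create_file(&hypervisor_subsys.kset.kobj, &example_attr.attr);
 */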
48672 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/Makefile tmp-linux-2.6-xen.patch/drivers/xen/core/Makefile
48673 --- ref-linux-2.6.16.9/drivers/xen/core/Makefile        1970-01-01 01:00:00.000000000 +0100
48674 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/Makefile   2006-04-10 00:05:52.000000000 +0200
48675 @@ -0,0 +1,11 @@
48676 +#
48677 +# Makefile for the linux kernel.
48678 +#
48679 +
48680 +obj-y   := evtchn.o reboot.o gnttab.o features.o
48681 +
48682 +obj-$(CONFIG_PROC_FS) += xen_proc.o
48683 +obj-$(CONFIG_NET)     += skbuff.o
48684 +obj-$(CONFIG_SMP)     += smpboot.o
48685 +obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
48686 +obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
48687 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/reboot.c tmp-linux-2.6-xen.patch/drivers/xen/core/reboot.c
48688 --- ref-linux-2.6.16.9/drivers/xen/core/reboot.c        1970-01-01 01:00:00.000000000 +0100
48689 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/reboot.c   2006-04-10 00:05:52.000000000 +0200
48690 @@ -0,0 +1,381 @@
48691 +#define __KERNEL_SYSCALLS__
48692 +#include <linux/version.h>
48693 +#include <linux/kernel.h>
48694 +#include <linux/mm.h>
48695 +#include <linux/unistd.h>
48696 +#include <linux/module.h>
48697 +#include <linux/reboot.h>
48698 +#include <linux/sysrq.h>
48699 +#include <linux/stringify.h>
48700 +#include <asm/irq.h>
48701 +#include <asm/mmu_context.h>
48702 +#include <xen/evtchn.h>
48703 +#include <asm/hypervisor.h>
48704 +#include <xen/interface/dom0_ops.h>
48705 +#include <xen/xenbus.h>
48706 +#include <linux/cpu.h>
48707 +#include <linux/kthread.h>
48708 +#include <xen/gnttab.h>
48709 +#include <xen/xencons.h>
48710 +
48711 +#if defined(__i386__) || defined(__x86_64__)
48712 +/*
48713 + * Power off function, if any
48714 + */
48715 +void (*pm_power_off)(void);
48716 +EXPORT_SYMBOL(pm_power_off);
48717 +#endif
48718 +
48719 +extern void ctrl_alt_del(void);
48720 +
48721 +#define SHUTDOWN_INVALID  -1
48722 +#define SHUTDOWN_POWEROFF  0
48723 +#define SHUTDOWN_SUSPEND   2
48724 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
48725 + * report a crash, not be instructed to crash!
48726 + * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
48727 + * the distinction when we return the reason code to them.
48728 + */
48729 +#define SHUTDOWN_HALT      4
48730 +
48731 +void machine_emergency_restart(void)
48732 +{
48733 +       /* We really want to get pending console data out before we die. */
48734 +       xencons_force_flush();
48735 +       HYPERVISOR_shutdown(SHUTDOWN_reboot);
48736 +}
48737 +
48738 +void machine_restart(char * __unused)
48739 +{
48740 +       machine_emergency_restart();
48741 +}
48742 +
48743 +void machine_halt(void)
48744 +{
48745 +       machine_power_off();
48746 +}
48747 +
48748 +void machine_power_off(void)
48749 +{
48750 +       /* We really want to get pending console data out before we die. */
48751 +       xencons_force_flush();
48752 +#if defined(__i386__) || defined(__x86_64__)
48753 +       if (pm_power_off)
48754 +               pm_power_off();
48755 +#endif
48756 +       HYPERVISOR_shutdown(SHUTDOWN_poweroff);
48757 +}
48758 +
48759 +int reboot_thru_bios = 0;      /* for dmi_scan.c */
48760 +EXPORT_SYMBOL(machine_restart);
48761 +EXPORT_SYMBOL(machine_halt);
48762 +EXPORT_SYMBOL(machine_power_off);
48763 +
48764 +
48765 +/******************************************************************************
48766 + * Stop/pickle callback handling.
48767 + */
48768 +
48769 +/* Ignore multiple shutdown requests. */
48770 +static int shutting_down = SHUTDOWN_INVALID;
48771 +static void __shutdown_handler(void *unused);
48772 +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
48773 +
48774 +#ifdef CONFIG_SMP
48775 +int  smp_suspend(void);
48776 +void smp_resume(void);
48777 +#else
48778 +#define smp_suspend()  (0)
48779 +#define smp_resume()   ((void)0)
48780 +#endif
48781 +
48782 +/* Ensure we run on the idle task page tables so that we will
48783 +   switch page tables before running user space. This is needed
48784 +   on architectures with separate kernel and user page tables
48785 +   because the user page table pointer is not saved/restored. */
48786 +static void switch_idle_mm(void)
48787 +{
48788 +       struct mm_struct *mm = current->active_mm;
48789 +
48790 +       if (mm == &init_mm)
48791 +               return;
48792 +
48793 +       atomic_inc(&init_mm.mm_count);
48794 +       switch_mm(mm, &init_mm, current);
48795 +       current->active_mm = &init_mm;
48796 +       mmdrop(mm);
48797 +}
48798 +
48799 +static int __do_suspend(void *ignore)
48800 +{
48801 +       int i, j, k, fpp, err;
48802 +
48803 +       extern unsigned long max_pfn;
48804 +       extern unsigned long *pfn_to_mfn_frame_list_list;
48805 +       extern unsigned long *pfn_to_mfn_frame_list[];
48806 +
48807 +       extern void time_resume(void);
48808 +
48809 +       BUG_ON(smp_processor_id() != 0);
48810 +       BUG_ON(in_interrupt());
48811 +
48812 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
48813 +               printk(KERN_WARNING "Cannot suspend in "
48814 +                      "auto_translated_physmap mode.\n");
48815 +               return -EOPNOTSUPP;
48816 +       }
48817 +
48818 +       err = smp_suspend();
48819 +       if (err)
48820 +               return err;
48821 +
48822 +       xenbus_suspend();
48823 +
48824 +       preempt_disable();
48825 +
48826 +#ifdef __i386__
48827 +       kmem_cache_shrink(pgd_cache);
48828 +#endif
48829 +       mm_pin_all();
48830 +
48831 +       __cli();
48832 +       preempt_enable();
48833 +
48834 +       gnttab_suspend();
48835 +
48836 +       HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
48837 +       clear_fixmap(FIX_SHARED_INFO);
48838 +
48839 +       xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
48840 +       xen_start_info->console_mfn = mfn_to_pfn(xen_start_info->console_mfn);
48841 +
48842 +       /*
48843 +        * We'll stop somewhere inside this hypercall. When it returns,
48844 +        * we'll start resuming after the restore.
48845 +        */
48846 +       HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
48847 +
48848 +       shutting_down = SHUTDOWN_INVALID;
48849 +
48850 +       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
48851 +
48852 +       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
48853 +
48854 +       memset(empty_zero_page, 0, PAGE_SIZE);
48855 +
48856 +       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
48857 +               virt_to_mfn(pfn_to_mfn_frame_list_list);
48858 +
48859 +       fpp = PAGE_SIZE/sizeof(unsigned long);
48860 +       for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
48861 +               if ((j % fpp) == 0) {
48862 +                       k++;
48863 +                       pfn_to_mfn_frame_list_list[k] =
48864 +                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
48865 +                       j = 0;
48866 +               }
48867 +               pfn_to_mfn_frame_list[k][j] =
48868 +                       virt_to_mfn(&phys_to_machine_mapping[i]);
48869 +       }
48870 +       HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
48871 +
48872 +       gnttab_resume();
48873 +
48874 +       irq_resume();
48875 +
48876 +       time_resume();
48877 +
48878 +       switch_idle_mm();
48879 +
48880 +       __sti();
48881 +
48882 +       xencons_resume();
48883 +
48884 +       xenbus_resume();
48885 +
48886 +       smp_resume();
48887 +
48888 +       return err;
48889 +}
48890 +
48891 +static int shutdown_process(void *__unused)
48892 +{
48893 +       static char *envp[] = { "HOME=/", "TERM=linux",
48894 +                               "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
48895 +       static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
48896 +
48897 +       extern asmlinkage long sys_reboot(int magic1, int magic2,
48898 +                                         unsigned int cmd, void *arg);
48899 +
48900 +       if ((shutting_down == SHUTDOWN_POWEROFF) ||
48901 +           (shutting_down == SHUTDOWN_HALT)) {
48902 +               if (execve("/sbin/poweroff", poweroff_argv, envp) < 0) {
48903 +                       sys_reboot(LINUX_REBOOT_MAGIC1,
48904 +                                  LINUX_REBOOT_MAGIC2,
48905 +                                  LINUX_REBOOT_CMD_POWER_OFF,
48906 +                                  NULL);
48907 +               }
48908 +       }
48909 +
48910 +       shutting_down = SHUTDOWN_INVALID; /* could try again */
48911 +
48912 +       return 0;
48913 +}
48914 +
48915 +static int kthread_create_on_cpu(int (*f)(void *arg),
48916 +                                void *arg,
48917 +                                const char *name,
48918 +                                int cpu)
48919 +{
48920 +       struct task_struct *p;
48921 +       p = kthread_create(f, arg, name);
48922 +       if (IS_ERR(p))
48923 +               return PTR_ERR(p);
48924 +       kthread_bind(p, cpu);
48925 +       wake_up_process(p);
48926 +       return 0;
48927 +}
48928 +
48929 +static void __shutdown_handler(void *unused)
48930 +{
48931 +       int err;
48932 +
48933 +       if (shutting_down != SHUTDOWN_SUSPEND)
48934 +               err = kernel_thread(shutdown_process, NULL,
48935 +                                   CLONE_FS | CLONE_FILES);
48936 +       else
48937 +               err = kthread_create_on_cpu(__do_suspend, NULL, "suspend", 0);
48938 +
48939 +       if (err < 0) {
48940 +               printk(KERN_WARNING "Error creating shutdown process (%d): "
48941 +                      "retrying...\n", -err);
48942 +               schedule_delayed_work(&shutdown_work, HZ/2);
48943 +       }
48944 +}
48945 +
48946 +static void shutdown_handler(struct xenbus_watch *watch,
48947 +                            const char **vec, unsigned int len)
48948 +{
48949 +       char *str;
48950 +       xenbus_transaction_t xbt;
48951 +       int err;
48952 +
48953 +       if (shutting_down != SHUTDOWN_INVALID)
48954 +               return;
48955 +
48956 + again:
48957 +       err = xenbus_transaction_start(&xbt);
48958 +       if (err)
48959 +               return;
48960 +       str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
48961 +       /* Ignore read errors and empty reads. */
48962 +       if (XENBUS_IS_ERR_READ(str)) {
48963 +               xenbus_transaction_end(xbt, 1);
48964 +               return;
48965 +       }
48966 +
48967 +       xenbus_write(xbt, "control", "shutdown", "");
48968 +
48969 +       err = xenbus_transaction_end(xbt, 0);
48970 +       if (err == -EAGAIN) {
48971 +               kfree(str);
48972 +               goto again;
48973 +       }
48974 +
48975 +       if (strcmp(str, "poweroff") == 0)
48976 +               shutting_down = SHUTDOWN_POWEROFF;
48977 +       else if (strcmp(str, "reboot") == 0)
48978 +               ctrl_alt_del();
48979 +       else if (strcmp(str, "suspend") == 0)
48980 +               shutting_down = SHUTDOWN_SUSPEND;
48981 +       else if (strcmp(str, "halt") == 0)
48982 +               shutting_down = SHUTDOWN_HALT;
48983 +       else {
48984 +               printk(KERN_WARNING "Ignoring shutdown request: %s\n", str);
48985 +               shutting_down = SHUTDOWN_INVALID;
48986 +       }
48987 +
48988 +       if (shutting_down != SHUTDOWN_INVALID)
48989 +               schedule_work(&shutdown_work);
48990 +
48991 +       kfree(str);
48992 +}
48993 +
48994 +static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
48995 +                         unsigned int len)
48996 +{
48997 +       char sysrq_key = '\0';
48998 +       xenbus_transaction_t xbt;
48999 +       int err;
49000 +
49001 + again:
49002 +       err = xenbus_transaction_start(&xbt);
49003 +       if (err)
49004 +               return;
49005 +       if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
49006 +               printk(KERN_ERR "Unable to read sysrq code in "
49007 +                      "control/sysrq\n");
49008 +               xenbus_transaction_end(xbt, 1);
49009 +               return;
49010 +       }
49011 +
49012 +       if (sysrq_key != '\0')
49013 +               xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
49014 +
49015 +       err = xenbus_transaction_end(xbt, 0);
49016 +       if (err == -EAGAIN)
49017 +               goto again;
49018 +
49019 +#ifdef CONFIG_MAGIC_SYSRQ
49020 +       if (sysrq_key != '\0')
49021 +               handle_sysrq(sysrq_key, NULL, NULL);
49022 +#endif
49023 +}
49024 +
49025 +static struct xenbus_watch shutdown_watch = {
49026 +       .node = "control/shutdown",
49027 +       .callback = shutdown_handler
49028 +};
49029 +
49030 +static struct xenbus_watch sysrq_watch = {
49031 +       .node = "control/sysrq",
49032 +       .callback = sysrq_handler
49033 +};
49034 +
49035 +static int setup_shutdown_watcher(struct notifier_block *notifier,
49036 +                                  unsigned long event,
49037 +                                  void *data)
49038 +{
49039 +       int err;
49040 +
49041 +       err = register_xenbus_watch(&shutdown_watch);
49042 +       if (err)
49043 +               printk(KERN_ERR "Failed to set shutdown watcher\n");
49044 +
49045 +       err = register_xenbus_watch(&sysrq_watch);
49046 +       if (err)
49047 +               printk(KERN_ERR "Failed to set sysrq watcher\n");
49048 +
49049 +       return NOTIFY_DONE;
49050 +}
49051 +
49052 +static int __init setup_shutdown_event(void)
49053 +{
49054 +       static struct notifier_block xenstore_notifier = {
49055 +               .notifier_call = setup_shutdown_watcher
49056 +       };
49057 +       register_xenstore_notifier(&xenstore_notifier);
49058 +       return 0;
49059 +}
49060 +
49061 +subsys_initcall(setup_shutdown_event);
49062 +
49063 +/*
49064 + * Local variables:
49065 + *  c-file-style: "linux"
49066 + *  indent-tabs-mode: t
49067 + *  c-indent-level: 8
49068 + *  c-basic-offset: 8
49069 + *  tab-width: 8
49070 + * End:
49071 + */
49072 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/skbuff.c tmp-linux-2.6-xen.patch/drivers/xen/core/skbuff.c
49073 --- ref-linux-2.6.16.9/drivers/xen/core/skbuff.c        1970-01-01 01:00:00.000000000 +0100
49074 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/skbuff.c   2006-04-10 00:05:52.000000000 +0200
49075 @@ -0,0 +1,144 @@
49076 +
49077 +#include <linux/config.h>
49078 +#include <linux/module.h>
49079 +#include <linux/version.h>
49080 +#include <linux/kernel.h>
49081 +#include <linux/sched.h>
49082 +#include <linux/slab.h>
49083 +#include <linux/netdevice.h>
49084 +#include <linux/inetdevice.h>
49085 +#include <linux/etherdevice.h>
49086 +#include <linux/skbuff.h>
49087 +#include <linux/init.h>
49088 +#include <asm/io.h>
49089 +#include <asm/page.h>
49090 +#include <asm/hypervisor.h>
49091 +
49092 +/* Referenced in netback.c. */
49093 +/*static*/ kmem_cache_t *skbuff_cachep;
49094 +EXPORT_SYMBOL(skbuff_cachep);
49095 +
49096 +#define MAX_SKBUFF_ORDER 4
49097 +static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
49098 +
49099 +static struct {
49100 +       int size;
49101 +       kmem_cache_t *cachep;
49102 +} skbuff_small[] = { { 512, NULL }, { 2048, NULL } };
49103 +
49104 +struct sk_buff *__alloc_skb(unsigned int length, gfp_t gfp_mask,
49105 +                           int fclone)
49106 +{
49107 +       int order, i;
49108 +       kmem_cache_t *cachep;
49109 +
49110 +       length = SKB_DATA_ALIGN(length) + sizeof(struct skb_shared_info);
49111 +
49112 +       if (length <= skbuff_small[ARRAY_SIZE(skbuff_small)-1].size) {
49113 +               for (i = 0; skbuff_small[i].size < length; i++)
49114 +                       continue;
49115 +               cachep = skbuff_small[i].cachep;
49116 +       } else {
49117 +               order = get_order(length);
49118 +               if (order > MAX_SKBUFF_ORDER) {
49119 +                       printk(KERN_ALERT "Attempt to allocate order %d "
49120 +                              "skbuff. Increase MAX_SKBUFF_ORDER.\n", order);
49121 +                       return NULL;
49122 +               }
49123 +               cachep = skbuff_order_cachep[order];
49124 +       }
49125 +
49126 +       length -= sizeof(struct skb_shared_info);
49127 +
49128 +       return alloc_skb_from_cache(cachep, length, gfp_mask, fclone);
49129 +}
49130 +
49131 +struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask)
49132 +{
49133 +       struct sk_buff *skb;
49134 +       int order;
49135 +
49136 +       length = SKB_DATA_ALIGN(length + 16);
49137 +       order = get_order(length + sizeof(struct skb_shared_info));
49138 +       if (order > MAX_SKBUFF_ORDER) {
49139 +               printk(KERN_ALERT "Attempt to allocate order %d skbuff. "
49140 +                      "Increase MAX_SKBUFF_ORDER.\n", order);
49141 +               return NULL;
49142 +       }
49143 +
49144 +       skb = alloc_skb_from_cache(
49145 +               skbuff_order_cachep[order], length, gfp_mask, 0);
49146 +       if (skb != NULL)
49147 +               skb_reserve(skb, 16);
49148 +
49149 +       return skb;
49150 +}
49151 +
49152 +static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused)
49153 +{
49154 +       int order = 0;
49155 +
49156 +       while (skbuff_order_cachep[order] != cachep)
49157 +               order++;
49158 +
49159 +       /* Do our best to allocate contiguous memory but fall back to IOMMU. */
49160 +       if (order != 0)
49161 +               (void)xen_create_contiguous_region(
49162 +                       (unsigned long)buf, order, 0);
49163 +
49164 +       scrub_pages(buf, 1 << order);
49165 +}
49166 +
49167 +static void skbuff_dtor(void *buf, kmem_cache_t *cachep, unsigned long unused)
49168 +{
49169 +       int order = 0;
49170 +
49171 +       while (skbuff_order_cachep[order] != cachep)
49172 +               order++;
49173 +
49174 +       if (order != 0)
49175 +               xen_destroy_contiguous_region((unsigned long)buf, order);
49176 +}
49177 +
49178 +static int __init skbuff_init(void)
49179 +{
49180 +       static char name[MAX_SKBUFF_ORDER + 1][20];
49181 +       static char small_name[ARRAY_SIZE(skbuff_small)][20];
49182 +       unsigned long size;
49183 +       int i, order;
49184 +
49185 +       for (i = 0; i < ARRAY_SIZE(skbuff_small); i++) {
49186 +               size = skbuff_small[i].size;
49187 +               sprintf(small_name[i], "xen-skb-%lu", size);
49188 +               /*
49189 +                * No ctor/dtor: objects do not span page boundaries, and they
49190 +                * are only used on transmit path so no need for scrubbing.
49191 +                */
49192 +               skbuff_small[i].cachep = kmem_cache_create(
49193 +                       small_name[i], size, size, 0, NULL, NULL);
49194 +       }
49195 +
49196 +       for (order = 0; order <= MAX_SKBUFF_ORDER; order++) {
49197 +               size = PAGE_SIZE << order;
49198 +               sprintf(name[order], "xen-skb-%lu", size);
49199 +               skbuff_order_cachep[order] = kmem_cache_create(
49200 +                       name[order], size, size, 0, skbuff_ctor, skbuff_dtor);
49201 +       }
49202 +
49203 +       skbuff_cachep = skbuff_order_cachep[0];
49204 +
49205 +       return 0;
49206 +}
49207 +core_initcall(skbuff_init);
49208 +
49209 +EXPORT_SYMBOL(__dev_alloc_skb);
49210 +
49211 +/*
49212 + * Local variables:
49213 + *  c-file-style: "linux"
49214 + *  indent-tabs-mode: t
49215 + *  c-indent-level: 8
49216 + *  c-basic-offset: 8
49217 + *  tab-width: 8
49218 + * End:
49219 + */
49220 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/smpboot.c tmp-linux-2.6-xen.patch/drivers/xen/core/smpboot.c
49221 --- ref-linux-2.6.16.9/drivers/xen/core/smpboot.c       1970-01-01 01:00:00.000000000 +0100
49222 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/smpboot.c  2006-04-10 00:05:52.000000000 +0200
49223 @@ -0,0 +1,590 @@
49224 +/*
49225 + *     Xen SMP booting functions
49226 + *
49227 + *     See arch/i386/kernel/smpboot.c for copyright and credits for derived
49228 + *     portions of this file.
49229 + */
49230 +
49231 +#include <linux/module.h>
49232 +#include <linux/config.h>
49233 +#include <linux/init.h>
49234 +#include <linux/kernel.h>
49235 +#include <linux/mm.h>
49236 +#include <linux/sched.h>
49237 +#include <linux/kernel_stat.h>
49238 +#include <linux/smp_lock.h>
49239 +#include <linux/irq.h>
49240 +#include <linux/bootmem.h>
49241 +#include <linux/notifier.h>
49242 +#include <linux/cpu.h>
49243 +#include <linux/percpu.h>
49244 +#include <asm/desc.h>
49245 +#include <asm/arch_hooks.h>
49246 +#include <asm/pgalloc.h>
49247 +#include <xen/evtchn.h>
49248 +#include <xen/interface/vcpu.h>
49249 +#include <xen/xenbus.h>
49250 +
49251 +#ifdef CONFIG_SMP_ALTERNATIVES
49252 +#include <asm/smp_alt.h>
49253 +#endif
49254 +
49255 +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
49256 +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
49257 +
49258 +extern void local_setup_timer(unsigned int cpu);
49259 +extern void local_teardown_timer(unsigned int cpu);
49260 +
49261 +extern void hypervisor_callback(void);
49262 +extern void failsafe_callback(void);
49263 +extern void system_call(void);
49264 +extern void smp_trap_init(trap_info_t *);
49265 +
49266 +/* Number of siblings per CPU package */
49267 +int smp_num_siblings = 1;
49268 +int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
49269 +EXPORT_SYMBOL(phys_proc_id);
49270 +int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */
49271 +EXPORT_SYMBOL(cpu_core_id);
49272 +
49273 +cpumask_t cpu_online_map;
49274 +EXPORT_SYMBOL(cpu_online_map);
49275 +cpumask_t cpu_possible_map;
49276 +EXPORT_SYMBOL(cpu_possible_map);
49277 +
49278 +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
49279 +EXPORT_SYMBOL(cpu_data);
49280 +
49281 +#ifdef CONFIG_HOTPLUG_CPU
49282 +DEFINE_PER_CPU(int, cpu_state) = { 0 };
49283 +#endif
49284 +
49285 +static DEFINE_PER_CPU(int, resched_irq);
49286 +static DEFINE_PER_CPU(int, callfunc_irq);
49287 +static char resched_name[NR_CPUS][15];
49288 +static char callfunc_name[NR_CPUS][15];
49289 +
49290 +u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
49291 +
49292 +void *xquad_portio;
49293 +
49294 +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
49295 +cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
49296 +EXPORT_SYMBOL(cpu_core_map);
49297 +
49298 +#if defined(__i386__)
49299 +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
49300 +EXPORT_SYMBOL(x86_cpu_to_apicid);
49301 +#elif !defined(CONFIG_X86_IO_APIC)
49302 +unsigned int maxcpus = NR_CPUS;
49303 +#endif
49304 +
49305 +/*
49306 + * Set of CPUs that remote admin software will allow us to bring online.
49307 + * Notified to us via xenbus.
49308 + */
49309 +static cpumask_t xenbus_allowed_cpumask;
49310 +
49311 +/* Set of CPUs that local admin will allow us to bring online. */
49312 +static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
49313 +
49314 +void __init prefill_possible_map(void)
49315 +{
49316 +       int i, rc;
49317 +
49318 +       if (!cpus_empty(cpu_possible_map))
49319 +               return;
49320 +
49321 +       for (i = 0; i < NR_CPUS; i++) {
49322 +               rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
49323 +               if (rc == -ENOENT)
49324 +                       break;
49325 +               cpu_set(i, cpu_possible_map);
49326 +       }
49327 +}
49328 +
49329 +void __init smp_alloc_memory(void)
49330 +{
49331 +}
49332 +
49333 +static void xen_smp_intr_init(unsigned int cpu)
49334 +{
49335 +       sprintf(resched_name[cpu], "resched%d", cpu);
49336 +       per_cpu(resched_irq, cpu) =
49337 +               bind_ipi_to_irqhandler(
49338 +                       RESCHEDULE_VECTOR,
49339 +                       cpu,
49340 +                       smp_reschedule_interrupt,
49341 +                       SA_INTERRUPT,
49342 +                       resched_name[cpu],
49343 +                       NULL);
49344 +       BUG_ON(per_cpu(resched_irq, cpu) < 0);
49345 +
49346 +       sprintf(callfunc_name[cpu], "callfunc%d", cpu);
49347 +       per_cpu(callfunc_irq, cpu) =
49348 +               bind_ipi_to_irqhandler(
49349 +                       CALL_FUNCTION_VECTOR,
49350 +                       cpu,
49351 +                       smp_call_function_interrupt,
49352 +                       SA_INTERRUPT,
49353 +                       callfunc_name[cpu],
49354 +                       NULL);
49355 +       BUG_ON(per_cpu(callfunc_irq, cpu) < 0);
49356 +
49357 +       if (cpu != 0)
49358 +               local_setup_timer(cpu);
49359 +}
49360 +
49361 +#ifdef CONFIG_HOTPLUG_CPU
49362 +static void xen_smp_intr_exit(unsigned int cpu)
49363 +{
49364 +       if (cpu != 0)
49365 +               local_teardown_timer(cpu);
49366 +
49367 +       unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
49368 +       unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
49369 +}
49370 +#endif
49371 +
49372 +static void cpu_bringup(void)
49373 +{
49374 +       cpu_init();
49375 +       touch_softlockup_watchdog();
49376 +       preempt_disable();
49377 +       local_irq_enable();
49378 +       cpu_idle();
49379 +}
49380 +
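+/*
+ * Build the initial register and descriptor-table state for a secondary
+ * VCPU and register it with Xen via VCPUOP_initialise. The VCPU starts
+ * executing at cpu_bringup() once it is brought up.
+ */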
49381 +static void vcpu_prepare(int vcpu)
49382 +{
49383 +       vcpu_guest_context_t ctxt;
49384 +       struct task_struct *idle = idle_task(vcpu);
49385 +#ifdef __x86_64__
49386 +       struct desc_ptr *gdt_descr = &cpu_gdt_descr[vcpu];
49387 +#else
49388 +       struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, vcpu);
49389 +#endif
49390 +
49391 +       if (vcpu == 0)
49392 +               return;
49393 +
49394 +       memset(&ctxt, 0, sizeof(ctxt));
49395 +
49396 +       ctxt.flags = VGCF_IN_KERNEL;
49397 +       ctxt.user_regs.ds = __USER_DS;
49398 +       ctxt.user_regs.es = __USER_DS;
49399 +       ctxt.user_regs.fs = 0;
49400 +       ctxt.user_regs.gs = 0;
49401 +       ctxt.user_regs.ss = __KERNEL_DS;
49402 +       ctxt.user_regs.eip = (unsigned long)cpu_bringup;
49403 +       ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
49404 +
49405 +       memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
49406 +
49407 +       smp_trap_init(ctxt.trap_ctxt);
49408 +
49409 +       ctxt.ldt_ents = 0;
49410 +
49411 +       ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
49412 +       ctxt.gdt_ents      = gdt_descr->size / 8;
49413 +
49414 +#ifdef __i386__
49415 +       ctxt.user_regs.cs = __KERNEL_CS;
49416 +       ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
49417 +
49418 +       ctxt.kernel_ss = __KERNEL_DS;
49419 +       ctxt.kernel_sp = idle->thread.esp0;
49420 +
49421 +       ctxt.event_callback_cs     = __KERNEL_CS;
49422 +       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
49423 +       ctxt.failsafe_callback_cs  = __KERNEL_CS;
49424 +       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
49425 +
49426 +       ctxt.ctrlreg[3] = virt_to_mfn(swapper_pg_dir) << PAGE_SHIFT;
49427 +#else /* __x86_64__ */
49428 +       ctxt.user_regs.cs = __KERNEL_CS;
49429 +       ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
49430 +
49431 +       ctxt.kernel_ss = __KERNEL_DS;
49432 +       ctxt.kernel_sp = idle->thread.rsp0;
49433 +
49434 +       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
49435 +       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
49436 +       ctxt.syscall_callback_eip  = (unsigned long)system_call;
49437 +
49438 +       ctxt.ctrlreg[3] = virt_to_mfn(init_level4_pgt) << PAGE_SHIFT;
49439 +
49440 +       ctxt.gs_base_kernel = (unsigned long)(cpu_pda(vcpu));
49441 +#endif
49442 +
49443 +       BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, vcpu, &ctxt));
49444 +}
49445 +
49446 +void __init smp_prepare_cpus(unsigned int max_cpus)
49447 +{
49448 +       int cpu;
49449 +       struct task_struct *idle;
49450 +#ifdef __x86_64__
49451 +       struct desc_ptr *gdt_descr;
49452 +#else
49453 +       struct Xgt_desc_struct *gdt_descr;
49454 +#endif
49455 +
49456 +       cpu_data[0] = boot_cpu_data;
49457 +
49458 +       cpu_2_logical_apicid[0] = 0;
49459 +       x86_cpu_to_apicid[0] = 0;
49460 +
49461 +       current_thread_info()->cpu = 0;
49462 +       cpu_sibling_map[0] = cpumask_of_cpu(0);
49463 +       cpu_core_map[0]    = cpumask_of_cpu(0);
49464 +
49465 +       xen_smp_intr_init(0);
49466 +
49467 +       for_each_cpu_mask (cpu, cpu_possible_map) {
49468 +               if (cpu == 0)
49469 +                       continue;
49470 +
49471 +#ifdef __x86_64__
49472 +               gdt_descr = &cpu_gdt_descr[cpu];
49473 +#else
49474 +               gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
49475 +#endif
49476 +               gdt_descr->address = get_zeroed_page(GFP_KERNEL);
49477 +               if (unlikely(!gdt_descr->address)) {
49478 +                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
49479 +                       continue;
49480 +               }
49481 +               gdt_descr->size = GDT_SIZE;
49482 +               memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
49483 +               make_page_readonly(
49484 +                       (void *)gdt_descr->address,
49485 +                       XENFEAT_writable_descriptor_tables);
49486 +
49487 +               cpu_data[cpu] = boot_cpu_data;
49488 +               cpu_2_logical_apicid[cpu] = cpu;
49489 +               x86_cpu_to_apicid[cpu] = cpu;
49490 +
49491 +               idle = fork_idle(cpu);
49492 +               if (IS_ERR(idle))
49493 +                       panic("failed fork for CPU %d", cpu);
49494 +
49495 +#ifdef __x86_64__
49496 +               cpu_pda(cpu)->pcurrent = idle;
49497 +               cpu_pda(cpu)->cpunumber = cpu;
49498 +               clear_ti_thread_flag(idle->thread_info, TIF_FORK);
49499 +#endif
49500 +
49501 +               irq_ctx_init(cpu);
49502 +
49503 +#ifdef CONFIG_HOTPLUG_CPU
49504 +               if (xen_start_info->flags & SIF_INITDOMAIN)
49505 +                       cpu_set(cpu, cpu_present_map);
49506 +#else
49507 +               cpu_set(cpu, cpu_present_map);
49508 +#endif
49509 +
49510 +               vcpu_prepare(cpu);
49511 +       }
49512 +
49513 +       xenbus_allowed_cpumask = cpu_present_map;
49514 +
49515 +       /* Currently, Xen gives no dynamic NUMA/HT info. */
49516 +       for (cpu = 1; cpu < NR_CPUS; cpu++) {
49517 +               cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
49518 +               cpu_core_map[cpu]    = cpumask_of_cpu(cpu);
49519 +       }
49520 +
49521 +#ifdef CONFIG_X86_IO_APIC
49522 +       /*
49523 +        * Here we can be sure that there is an IO-APIC in the system. Let's
49524 +        * go and set it up:
49525 +        */
49526 +       if (!skip_ioapic_setup && nr_ioapics)
49527 +               setup_IO_APIC();
49528 +#endif
49529 +}
49530 +
49531 +void __devinit smp_prepare_boot_cpu(void)
49532 +{
49533 +       prefill_possible_map();
49534 +       cpu_present_map  = cpumask_of_cpu(0);
49535 +       cpu_online_map   = cpumask_of_cpu(0);
49536 +}
49537 +
49538 +static int local_cpu_hotplug_request(void)
49539 +{
49540 +       /*
49541 +        * We assume a CPU hotplug request comes from local admin if it is made
49542 +        * via a userspace process (i.e., one with a real mm_struct).
49543 +        */
49544 +       return (current->mm != NULL);
49545 +}
49546 +
49547 +#ifdef CONFIG_HOTPLUG_CPU
49548 +
49549 +/*
49550 + * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
49551 + * But do it early enough to catch critical for_each_present_cpu() loops
49552 + * in i386-specific code.
49553 + */
49554 +static int __init initialize_cpu_present_map(void)
49555 +{
49556 +       cpu_present_map = cpu_possible_map;
49557 +       return 0;
49558 +}
49559 +core_initcall(initialize_cpu_present_map);
49560 +
49561 +static void vcpu_hotplug(unsigned int cpu)
49562 +{
49563 +       int err;
49564 +       char dir[32], state[32];
49565 +
49566 +       if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
49567 +               return;
49568 +
49569 +       sprintf(dir, "cpu/%d", cpu);
49570 +       err = xenbus_scanf(XBT_NULL, dir, "availability", "%s", state);
49571 +       if (err != 1) {
49572 +               printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
49573 +               return;
49574 +       }
49575 +
49576 +       if (strcmp(state, "online") == 0) {
49577 +               cpu_set(cpu, xenbus_allowed_cpumask);
49578 +               (void)cpu_up(cpu);
49579 +       } else if (strcmp(state, "offline") == 0) {
49580 +               cpu_clear(cpu, xenbus_allowed_cpumask);
49581 +               (void)cpu_down(cpu);
49582 +       } else {
49583 +               printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
49584 +                      state, cpu);
49585 +       }
49586 +}
49587 +
49588 +static void handle_vcpu_hotplug_event(
49589 +       struct xenbus_watch *watch, const char **vec, unsigned int len)
49590 +{
49591 +       int cpu;
49592 +       char *cpustr;
49593 +       const char *node = vec[XS_WATCH_PATH];
49594 +
49595 +       if ((cpustr = strstr(node, "cpu/")) != NULL) {
49596 +               sscanf(cpustr, "cpu/%d", &cpu);
49597 +               vcpu_hotplug(cpu);
49598 +       }
49599 +}
49600 +
49601 +static int smpboot_cpu_notify(struct notifier_block *notifier,
49602 +                             unsigned long action, void *hcpu)
49603 +{
49604 +       int cpu = (long)hcpu;
49605 +
49606 +       /*
49607 +        * We do this in a callback notifier rather than __cpu_disable()
49608 +        * because local_cpu_hotplug_request() does not work in the latter
49609 +        * as it's always executed from within a stopmachine kthread.
49610 +        */
49611 +       if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
49612 +               cpu_clear(cpu, local_allowed_cpumask);
49613 +
49614 +       return NOTIFY_OK;
49615 +}
49616 +
49617 +static int setup_cpu_watcher(struct notifier_block *notifier,
49618 +                             unsigned long event, void *data)
49619 +{
49620 +       int i;
49621 +
49622 +       static struct xenbus_watch cpu_watch = {
49623 +               .node = "cpu",
49624 +               .callback = handle_vcpu_hotplug_event,
49625 +               .flags = XBWF_new_thread };
49626 +       (void)register_xenbus_watch(&cpu_watch);
49627 +
49628 +       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
49629 +               for_each_cpu(i)
49630 +                       vcpu_hotplug(i);
49631 +               printk(KERN_INFO "Brought up %ld CPUs\n",
49632 +                      (long)num_online_cpus());
49633 +       }
49634 +
49635 +       return NOTIFY_DONE;
49636 +}
49637 +
49638 +static int __init setup_vcpu_hotplug_event(void)
49639 +{
49640 +       static struct notifier_block hotplug_cpu = {
49641 +               .notifier_call = smpboot_cpu_notify };
49642 +       static struct notifier_block xsn_cpu = {
49643 +               .notifier_call = setup_cpu_watcher };
49644 +
49645 +       register_cpu_notifier(&hotplug_cpu);
49646 +       register_xenstore_notifier(&xsn_cpu);
49647 +
49648 +       return 0;
49649 +}
49650 +
49651 +arch_initcall(setup_vcpu_hotplug_event);
49652 +
49653 +int smp_suspend(void)
49654 +{
49655 +       int i, err;
49656 +
49657 +       lock_cpu_hotplug();
49658 +
49659 +       /*
49660 +        * Take all other CPUs offline. We hold the hotplug mutex to
49661 +        * avoid other processes bringing up CPUs under our feet.
49662 +        */
49663 +       while (num_online_cpus() > 1) {
49664 +               unlock_cpu_hotplug();
49665 +               for_each_online_cpu(i) {
49666 +                       if (i == 0)
49667 +                               continue;
49668 +                       err = cpu_down(i);
49669 +                       if (err) {
49670 +                               printk(KERN_CRIT "Failed to take all CPUs "
49671 +                                      "down: %d.\n", err);
49672 +                               for_each_cpu(i)
49673 +                                       vcpu_hotplug(i);
49674 +                               return err;
49675 +                       }
49676 +               }
49677 +               lock_cpu_hotplug();
49678 +       }
49679 +
49680 +       return 0;
49681 +}
49682 +
49683 +void smp_resume(void)
49684 +{
49685 +       int i;
49686 +
49687 +       for_each_cpu(i)
49688 +               vcpu_prepare(i);
49689 +
49690 +       unlock_cpu_hotplug();
49691 +
49692 +       for_each_cpu(i)
49693 +               vcpu_hotplug(i);
49694 +}
49695 +
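+/*
+ * Runs on the CPU being unplugged: reroute its IRQs to the remaining
+ * online CPUs and clear it from cpu_online_map. CPU0 cannot be removed.
+ */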
49696 +int __cpu_disable(void)
49697 +{
49698 +       cpumask_t map = cpu_online_map;
49699 +       int cpu = smp_processor_id();
49700 +
49701 +       if (cpu == 0)
49702 +               return -EBUSY;
49703 +
49704 +       cpu_clear(cpu, map);
49705 +       fixup_irqs(map);
49706 +       cpu_clear(cpu, cpu_online_map);
49707 +
49708 +       return 0;
49709 +}
49710 +
49711 +void __cpu_die(unsigned int cpu)
49712 +{
49713 +       while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
49714 +               current->state = TASK_UNINTERRUPTIBLE;
49715 +               schedule_timeout(HZ/10);
49716 +       }
49717 +
49718 +       xen_smp_intr_exit(cpu);
49719 +
49720 +#ifdef CONFIG_SMP_ALTERNATIVES
49721 +       if (num_online_cpus() == 1)
49722 +               unprepare_for_smp();
49723 +#endif
49724 +}
49725 +
49726 +#else /* !CONFIG_HOTPLUG_CPU */
49727 +
49728 +int smp_suspend(void)
49729 +{
49730 +       if (num_online_cpus() > 1) {
49731 +               printk(KERN_WARNING "Can't suspend SMP guests "
49732 +                      "without CONFIG_HOTPLUG_CPU\n");
49733 +               return -EOPNOTSUPP;
49734 +       }
49735 +       return 0;
49736 +}
49737 +
49738 +void smp_resume(void)
49739 +{
49740 +}
49741 +
49742 +int __cpu_disable(void)
49743 +{
49744 +       return -ENOSYS;
49745 +}
49746 +
49747 +void __cpu_die(unsigned int cpu)
49748 +{
49749 +       BUG();
49750 +}
49751 +
49752 +#endif /* CONFIG_HOTPLUG_CPU */
49753 +
49754 +int __devinit __cpu_up(unsigned int cpu)
49755 +{
49756 +       int rc;
49757 +
49758 +       if (local_cpu_hotplug_request()) {
49759 +               cpu_set(cpu, local_allowed_cpumask);
49760 +               if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
49761 +                       printk(KERN_WARNING "%s: attempt to bring up CPU %u disallowed by "
49762 +                              "remote admin.\n", __FUNCTION__, cpu);
49763 +                       return -EBUSY;
49764 +               }
49765 +       } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
49766 +                  !cpu_isset(cpu, xenbus_allowed_cpumask)) {
49767 +               return -EBUSY;
49768 +       }
49769 +
49770 +#ifdef CONFIG_SMP_ALTERNATIVES
49771 +       if (num_online_cpus() == 1)
49772 +               prepare_for_smp();
49773 +#endif
49774 +
49775 +       xen_smp_intr_init(cpu);
49776 +       cpu_set(cpu, cpu_online_map);
49777 +
49778 +       rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
49779 +       if (rc != 0)
49780 +               BUG();
49781 +
49782 +       return 0;
49783 +}
49784 +
49785 +void __init smp_cpus_done(unsigned int max_cpus)
49786 +{
49787 +}
49788 +
49789 +#ifndef CONFIG_X86_LOCAL_APIC
49790 +int setup_profiling_timer(unsigned int multiplier)
49791 +{
49792 +       return -EINVAL;
49793 +}
49794 +#endif
49795 +
49796 +/*
49797 + * Local variables:
49798 + *  c-file-style: "linux"
49799 + *  indent-tabs-mode: t
49800 + *  c-indent-level: 8
49801 + *  c-basic-offset: 8
49802 + *  tab-width: 8
49803 + * End:
49804 + */
49805 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/xen_proc.c tmp-linux-2.6-xen.patch/drivers/xen/core/xen_proc.c
49806 --- ref-linux-2.6.16.9/drivers/xen/core/xen_proc.c      1970-01-01 01:00:00.000000000 +0100
49807 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/xen_proc.c 2006-04-10 00:05:52.000000000 +0200
49808 @@ -0,0 +1,29 @@
49809 +
49810 +#include <linux/config.h>
49811 +#include <linux/proc_fs.h>
49812 +#include <xen/xen_proc.h>
49813 +
49814 +static struct proc_dir_entry *xen_base;
49815 +
49816 +struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
49817 +{
49818 +       if (xen_base == NULL)
49819 +               if ((xen_base = proc_mkdir("xen", &proc_root)) == NULL)
49820 +                       panic("Couldn't create /proc/xen");
49821 +       return create_proc_entry(name, mode, xen_base);
49822 +}
49823 +
49824 +void remove_xen_proc_entry(const char *name)
49825 +{
49826 +       remove_proc_entry(name, xen_base);
49827 +}
49828 +
49829 +/*
49830 + * Local variables:
49831 + *  c-file-style: "linux"
49832 + *  indent-tabs-mode: t
49833 + *  c-indent-level: 8
49834 + *  c-basic-offset: 8
49835 + *  tab-width: 8
49836 + * End:
49837 + */
49838 diff -Nurp ref-linux-2.6.16.9/drivers/xen/core/xen_sysfs.c tmp-linux-2.6-xen.patch/drivers/xen/core/xen_sysfs.c
49839 --- ref-linux-2.6.16.9/drivers/xen/core/xen_sysfs.c     1970-01-01 01:00:00.000000000 +0100
49840 +++ tmp-linux-2.6-xen.patch/drivers/xen/core/xen_sysfs.c        2006-04-10 00:05:52.000000000 +0200
49841 @@ -0,0 +1,316 @@
49842 +/*
49843 + *  copyright (c) 2006 IBM Corporation
49844 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
49845 + *
49846 + *  This program is free software; you can redistribute it and/or modify
49847 + *  it under the terms of the GNU General Public License version 2 as
49848 + *  published by the Free Software Foundation.
49849 + */
49850 +
49851 +#include <linux/config.h>
49852 +#include <linux/kernel.h>
49853 +#include <linux/module.h>
49854 +#include <linux/init.h>
49855 +#include <asm/hypervisor.h>
49856 +#include <xen/features.h>
49857 +#include <xen/hypervisor_sysfs.h>
49858 +
49859 +MODULE_LICENSE("GPL");
49860 +MODULE_AUTHOR("Mike D. Day <ncmike@us.ibm.com>");
49861 +
49862 +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
49863 +{
49864 +       return sprintf(buffer, "xen\n");
49865 +}
49866 +
49867 +HYPERVISOR_ATTR_RO(type);
49868 +
49869 +static int __init xen_sysfs_type_init(void)
49870 +{
49871 +       return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
49872 +}
49873 +
49874 +static void xen_sysfs_type_destroy(void)
49875 +{
49876 +       sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
49877 +}
49878 +
49879 +/* xen version attributes */
49880 +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
49881 +{
49882 +       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
49883 +       if (version)
49884 +               return sprintf(buffer, "%d\n", version >> 16);
49885 +       return -ENODEV;
49886 +}
49887 +
49888 +HYPERVISOR_ATTR_RO(major);
49889 +
49890 +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
49891 +{
49892 +       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
49893 +       if (version)
49894 +               return sprintf(buffer, "%d\n", version & 0xff);
49895 +       return -ENODEV;
49896 +}
49897 +
49898 +HYPERVISOR_ATTR_RO(minor);
49899 +
49900 +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
49901 +{
49902 +       int ret;
49903 +       char *extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
49904 +       if (extra) {
49905 +               ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
49906 +               if (!ret)
49906 +                       ret = sprintf(buffer, "%s\n", extra);
49908 +               kfree(extra);
49909 +       } else
49910 +               ret = -ENOMEM;
49911 +       return ret;
49912 +}
49913 +
49914 +HYPERVISOR_ATTR_RO(extra);
49915 +
49916 +static struct attribute *version_attrs[] = {
49917 +       &major_attr.attr,
49918 +       &minor_attr.attr,
49919 +       &extra_attr.attr,
49920 +       NULL
49921 +};
49922 +
49923 +static struct attribute_group version_group = {
49924 +       .name = "version",
49925 +       .attrs = version_attrs,
49926 +};
49927 +
49928 +static int __init xen_sysfs_version_init(void)
49929 +{
49930 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj, &version_group);
49931 +}
49932 +
49933 +static void xen_sysfs_version_destroy(void)
49934 +{
49935 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
49936 +}
49937 +
49938 +/* xen compilation attributes */
49939 +
49940 +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
49941 +{
49942 +       int ret;
49943 +       struct xen_compile_info *info =
49944 +           kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
49945 +       if (info) {
49946 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
49947 +               if (!ret)
49948 +                       ret = sprintf(buffer, "%s\n", info->compiler);
49949 +               kfree(info);
49950 +       } else
49951 +               ret = -ENOMEM;
49952 +
49953 +       return ret;
49954 +}
49955 +
49956 +HYPERVISOR_ATTR_RO(compiler);
49957 +
49958 +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
49959 +{
49960 +       int ret;
49961 +       struct xen_compile_info *info;
49962 +
49963 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
49964 +       if (info) {
49965 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
49966 +               if (!ret)
49967 +                       ret = sprintf(buffer, "%s\n", info->compile_by);
49968 +               kfree(info);
49969 +       } else
49970 +               ret = -ENOMEM;
49971 +       return ret;
49972 +}
49973 +
49974 +HYPERVISOR_ATTR_RO(compiled_by);
49975 +
49976 +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
49977 +{
49978 +       int ret;
49979 +       struct xen_compile_info *info;
49980 +
49981 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
49982 +       if (info) {
49983 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
49984 +               if (!ret)
49985 +                       ret = sprintf(buffer, "%s\n", info->compile_date);
49986 +               kfree(info);
49987 +       } else
49988 +               ret = -ENOMEM;
49989 +       return ret;
49990 +}
49991 +
49992 +HYPERVISOR_ATTR_RO(compile_date);
49993 +
49994 +static struct attribute *xen_compile_attrs[] = {
49995 +       &compiler_attr.attr,
49996 +       &compiled_by_attr.attr,
49997 +       &compile_date_attr.attr,
49998 +       NULL
49999 +};
50000 +
50001 +static struct attribute_group xen_compilation_group = {
50002 +       .name = "compilation",
50003 +       .attrs = xen_compile_attrs,
50004 +};
50005 +
50006 +static int __init xen_compilation_init(void)
50007 +{
50008 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
50009 +                                 &xen_compilation_group);
50010 +}
50011 +
50012 +static void xen_compilation_destroy(void)
50013 +{
50014 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj,
50015 +                          &xen_compilation_group);
50016 +}
50017 +
50018 +/* xen properties info */
50019 +
50020 +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
50021 +{
50022 +       int ret;
50023 +       char *caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
50024 +       if (caps) {
50025 +               ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
50026 +               if (!ret)
50027 +                       ret = sprintf(buffer, "%s\n", caps);
50028 +               kfree(caps);
50029 +       } else
50030 +               ret = -ENOMEM;
50031 +       return ret;
50032 +}
50033 +
50034 +HYPERVISOR_ATTR_RO(capabilities);
50035 +
50036 +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
50037 +{
50038 +       int ret;
50039 +       char *cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
50040 +       if (cset) {
50041 +               ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
50042 +               if (!ret)
50043 +                       ret = sprintf(buffer, "%s\n", cset);
50044 +               kfree(cset);
50045 +       } else
50046 +               ret = -ENOMEM;
50047 +       return ret;
50048 +}
50049 +
50050 +HYPERVISOR_ATTR_RO(changeset);
50051 +
50052 +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
50053 +{
50054 +       int ret;
50055 +       struct xen_platform_parameters *parms =
50056 +           kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
50057 +       if (parms) {
50058 +               ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms);
50059 +               if (!ret)
50060 +                       ret = sprintf(buffer, "%lx\n", parms->virt_start);
50061 +               kfree(parms);
50062 +       } else
50063 +               ret = -ENOMEM;
50064 +       return ret;
50065 +}
50066 +
50067 +HYPERVISOR_ATTR_RO(virtual_start);
50068 +
50069 +/* eventually there will be several more features to export */
50070 +static ssize_t xen_feature_show(int index, char *buffer)
50071 +{
50072 +       int ret;
50073 +
50074 +       struct xen_feature_info *info =
50075 +           kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
50076 +       if (info) {
50077 +               info->submap_idx = index;
50078 +               ret = HYPERVISOR_xen_version(XENVER_get_features, info);
50079 +               if (!ret)
50080 +                       ret = sprintf(buffer, "%d\n", info->submap);
50081 +               kfree(info);
50082 +       } else
50083 +               ret = -ENOMEM;
50084 +       return ret;
50085 +}
50086 +
50087 +static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
50088 +{
50089 +       return xen_feature_show(XENFEAT_writable_page_tables, buffer);
50090 +}
50091 +
50092 +HYPERVISOR_ATTR_RO(writable_pt);
50093 +
50094 +static struct attribute *xen_properties_attrs[] = {
50095 +       &capabilities_attr.attr,
50096 +       &changeset_attr.attr,
50097 +       &virtual_start_attr.attr,
50098 +       &writable_pt_attr.attr,
50099 +       NULL
50100 +};
50101 +
50102 +static struct attribute_group xen_properties_group = {
50103 +       .name = "properties",
50104 +       .attrs = xen_properties_attrs,
50105 +};
50106 +
50107 +static int __init xen_properties_init(void)
50108 +{
50109 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
50110 +                                 &xen_properties_group);
50111 +}
50112 +
50113 +static void xen_properties_destroy(void)
50114 +{
50115 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj, &xen_properties_group);
50116 +}
50117 +
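+/*
+ * Register each attribute group in turn, unwinding the ones already
+ * registered on failure. Note the inverted test after
+ * xen_properties_init(): success jumps straight to 'out' with ret == 0.
+ */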
50118 +static int __init hyper_sysfs_init(void)
50119 +{
50120 +       int ret = xen_sysfs_type_init();
50121 +       if (ret)
50122 +               goto out;
50123 +       ret = xen_sysfs_version_init();
50124 +       if (ret)
50125 +               goto version_out;
50126 +       ret = xen_compilation_init();
50127 +       if (ret)
50128 +               goto comp_out;
50129 +       ret = xen_properties_init();
50130 +       if (!ret)
50131 +               goto out;
50132 +
50133 +       xen_compilation_destroy();
50134 +comp_out:
50135 +       xen_sysfs_version_destroy();
50136 +version_out:
50137 +       xen_sysfs_type_destroy();
50138 +out:
50139 +       return ret;
50140 +}
50141 +
50142 +static void hyper_sysfs_exit(void)
50143 +{
50144 +       xen_properties_destroy();
50145 +       xen_compilation_destroy();
50146 +       xen_sysfs_version_destroy();
50147 +       xen_sysfs_type_destroy();
50148 +
50149 +}
50150 +
50151 +module_init(hyper_sysfs_init);
50152 +module_exit(hyper_sysfs_exit);
50153 diff -Nurp ref-linux-2.6.16.9/drivers/xen/evtchn/evtchn.c tmp-linux-2.6-xen.patch/drivers/xen/evtchn/evtchn.c
50154 --- ref-linux-2.6.16.9/drivers/xen/evtchn/evtchn.c      1970-01-01 01:00:00.000000000 +0100
50155 +++ tmp-linux-2.6-xen.patch/drivers/xen/evtchn/evtchn.c 2006-04-10 00:05:52.000000000 +0200
50156 @@ -0,0 +1,469 @@
50157 +/******************************************************************************
50158 + * evtchn.c
50159 + * 
50160 + * Driver for receiving and demuxing event-channel signals.
50161 + * 
50162 + * Copyright (c) 2004-2005, K A Fraser
50163 + * Multi-process extensions Copyright (c) 2004, Steven Smith
50164 + * 
50165 + * This program is free software; you can redistribute it and/or
50166 + * modify it under the terms of the GNU General Public License version 2
50167 + * as published by the Free Software Foundation; or, when distributed
50168 + * separately from the Linux kernel or incorporated into other
50169 + * software packages, subject to the following license:
50170 + * 
50171 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50172 + * of this source file (the "Software"), to deal in the Software without
50173 + * restriction, including without limitation the rights to use, copy, modify,
50174 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50175 + * and to permit persons to whom the Software is furnished to do so, subject to
50176 + * the following conditions:
50177 + * 
50178 + * The above copyright notice and this permission notice shall be included in
50179 + * all copies or substantial portions of the Software.
50180 + * 
50181 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50182 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50183 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50184 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50185 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50186 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50187 + * IN THE SOFTWARE.
50188 + */
50189 +
50190 +#include <linux/config.h>
50191 +#include <linux/module.h>
50192 +#include <linux/kernel.h>
50193 +#include <linux/sched.h>
50194 +#include <linux/slab.h>
50195 +#include <linux/string.h>
50196 +#include <linux/errno.h>
50197 +#include <linux/fs.h>
50198 +#include <linux/errno.h>
50199 +#include <linux/miscdevice.h>
50200 +#include <linux/major.h>
50201 +#include <linux/proc_fs.h>
50202 +#include <linux/stat.h>
50203 +#include <linux/poll.h>
50204 +#include <linux/irq.h>
50205 +#include <linux/init.h>
50206 +#include <linux/gfp.h>
50207 +#include <xen/evtchn.h>
50208 +#include <xen/public/evtchn.h>
50209 +
50210 +struct per_user_data {
50211 +       /* Notification ring, accessed via /dev/xen/evtchn. */
50212 +#define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
50213 +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
50214 +       evtchn_port_t *ring;
50215 +       unsigned int ring_cons, ring_prod, ring_overflow;
50216 +
50217 +       /* Processes wait on this queue when ring is empty. */
50218 +       wait_queue_head_t evtchn_wait;
50219 +       struct fasync_struct *evtchn_async_queue;
50220 +};
50221 +
50222 +/* Who's bound to each port? */
50223 +static struct per_user_data *port_user[NR_EVENT_CHANNELS];
50224 +static spinlock_t port_user_lock;
50225 +
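+/*
+ * Interrupt-time upcall: queue the port on its owner's notification ring
+ * and wake any reader. The event stays masked until userspace re-enables
+ * it by writing the port back to the device.
+ */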
50226 +void evtchn_device_upcall(int port)
50227 +{
50228 +       struct per_user_data *u;
50229 +
50230 +       spin_lock(&port_user_lock);
50231 +
50232 +       mask_evtchn(port);
50233 +       clear_evtchn(port);
50234 +
50235 +       if ((u = port_user[port]) != NULL) {
50236 +               if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
50237 +                       u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
50238 +                       if (u->ring_cons == u->ring_prod++) {
50239 +                               wake_up_interruptible(&u->evtchn_wait);
50240 +                               kill_fasync(&u->evtchn_async_queue,
50241 +                                           SIGIO, POLL_IN);
50242 +                       }
50243 +               } else {
50244 +                       u->ring_overflow = 1;
50245 +               }
50246 +       }
50247 +
50248 +       spin_unlock(&port_user_lock);
50249 +}
50250 +
50251 +static ssize_t evtchn_read(struct file *file, char __user *buf,
50252 +                           size_t count, loff_t *ppos)
50253 +{
50254 +       int rc;
50255 +       unsigned int c, p, bytes1 = 0, bytes2 = 0;
50256 +       struct per_user_data *u = file->private_data;
50257 +
50258 +       /* Whole number of ports. */
50259 +       count &= ~(sizeof(evtchn_port_t)-1);
50260 +
50261 +       if (count == 0)
50262 +               return 0;
50263 +
50264 +       if (count > PAGE_SIZE)
50265 +               count = PAGE_SIZE;
50266 +
50267 +       for (;;) {
50268 +               if (u->ring_overflow)
50269 +                       return -EFBIG;
50270 +
50271 +               if ((c = u->ring_cons) != (p = u->ring_prod))
50272 +                       break;
50273 +
50274 +               if (file->f_flags & O_NONBLOCK)
50275 +                       return -EAGAIN;
50276 +
50277 +               rc = wait_event_interruptible(
50278 +                       u->evtchn_wait, u->ring_cons != u->ring_prod);
50279 +               if (rc)
50280 +                       return rc;
50281 +       }
50282 +
50283 +       /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
50284 +       if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
50285 +               bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
50286 +                       sizeof(evtchn_port_t);
50287 +               bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
50288 +       } else {
50289 +               bytes1 = (p - c) * sizeof(evtchn_port_t);
50290 +               bytes2 = 0;
50291 +       }
50292 +
50293 +       /* Truncate chunks according to caller's maximum byte count. */
50294 +       if (bytes1 > count) {
50295 +               bytes1 = count;
50296 +               bytes2 = 0;
50297 +       } else if ((bytes1 + bytes2) > count) {
50298 +               bytes2 = count - bytes1;
50299 +       }
50300 +
50301 +       if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
50302 +           ((bytes2 != 0) &&
50303 +            copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
50304 +               return -EFAULT;
50305 +
50306 +       u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
50307 +
50308 +       return bytes1 + bytes2;
50309 +}
50310 +
50311 +static ssize_t evtchn_write(struct file *file, const char __user *buf,
50312 +                            size_t count, loff_t *ppos)
50313 +{
50314 +       int  rc, i;
50315 +       evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
50316 +       struct per_user_data *u = file->private_data;
50317 +
50318 +       if (kbuf == NULL)
50319 +               return -ENOMEM;
50320 +
50321 +       /* Whole number of ports. */
50322 +       count &= ~(sizeof(evtchn_port_t)-1);
50323 +
50324 +       if (count == 0) {
50325 +               rc = 0;
50326 +               goto out;
50327 +       }
50328 +
50329 +       if (count > PAGE_SIZE)
50330 +               count = PAGE_SIZE;
50331 +
50332 +       if (copy_from_user(kbuf, buf, count) != 0) {
50333 +               rc = -EFAULT;
50334 +               goto out;
50335 +       }
50336 +
50337 +       spin_lock_irq(&port_user_lock);
50338 +       for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
50339 +               if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
50340 +                       unmask_evtchn(kbuf[i]);
50341 +       spin_unlock_irq(&port_user_lock);
50342 +
50343 +       rc = count;
50344 +
50345 + out:
50346 +       free_page((unsigned long)kbuf);
50347 +       return rc;
50348 +}
50349 +
50350 +static void evtchn_bind_to_user(struct per_user_data *u, int port)
50351 +{
50352 +       spin_lock_irq(&port_user_lock);
50353 +       BUG_ON(port_user[port] != NULL);
50354 +       port_user[port] = u;
50355 +       unmask_evtchn(port);
50356 +       spin_unlock_irq(&port_user_lock);
50357 +}
50358 +
50359 +static int evtchn_ioctl(struct inode *inode, struct file *file,
50360 +                        unsigned int cmd, unsigned long arg)
50361 +{
50362 +       int rc;
50363 +       struct per_user_data *u = file->private_data;
50364 +       void __user *uarg = (void __user *) arg;
50365 +       evtchn_op_t op = { 0 };
50366 +
50367 +       switch (cmd) {
50368 +       case IOCTL_EVTCHN_BIND_VIRQ: {
50369 +               struct ioctl_evtchn_bind_virq bind;
50370 +
50371 +               rc = -EFAULT;
50372 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
50373 +                       break;
50374 +
50375 +               op.cmd = EVTCHNOP_bind_virq;
50376 +               op.u.bind_virq.virq = bind.virq;
50377 +               op.u.bind_virq.vcpu = 0;
50378 +               rc = HYPERVISOR_event_channel_op(&op);
50379 +               if (rc != 0)
50380 +                       break;
50381 +
50382 +               rc = op.u.bind_virq.port;
50383 +               evtchn_bind_to_user(u, rc);
50384 +               break;
50385 +       }
50386 +
50387 +       case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
50388 +               struct ioctl_evtchn_bind_interdomain bind;
50389 +
50390 +               rc = -EFAULT;
50391 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
50392 +                       break;
50393 +
50394 +               op.cmd = EVTCHNOP_bind_interdomain;
50395 +               op.u.bind_interdomain.remote_dom  = bind.remote_domain;
50396 +               op.u.bind_interdomain.remote_port = bind.remote_port;
50397 +               rc = HYPERVISOR_event_channel_op(&op);
50398 +               if (rc != 0)
50399 +                       break;
50400 +
50401 +               rc = op.u.bind_interdomain.local_port;
50402 +               evtchn_bind_to_user(u, rc);
50403 +               break;
50404 +       }
50405 +
50406 +       case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
50407 +               struct ioctl_evtchn_bind_unbound_port bind;
50408 +
50409 +               rc = -EFAULT;
50410 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
50411 +                       break;
50412 +
50413 +               op.cmd = EVTCHNOP_alloc_unbound;
50414 +               op.u.alloc_unbound.dom        = DOMID_SELF;
50415 +               op.u.alloc_unbound.remote_dom = bind.remote_domain;
50416 +               rc = HYPERVISOR_event_channel_op(&op);
50417 +               if (rc != 0)
50418 +                       break;
50419 +
50420 +               rc = op.u.alloc_unbound.port;
50421 +               evtchn_bind_to_user(u, rc);
50422 +               break;
50423 +       }
50424 +
50425 +       case IOCTL_EVTCHN_UNBIND: {
50426 +               struct ioctl_evtchn_unbind unbind;
50427 +               int ret;
50428 +
50429 +               rc = -EFAULT;
50430 +               if (copy_from_user(&unbind, uarg, sizeof(unbind)))
50431 +                       break;
50432 +
50433 +               rc = -EINVAL;
50434 +               if (unbind.port >= NR_EVENT_CHANNELS)
50435 +                       break;
50436 +
50437 +               spin_lock_irq(&port_user_lock);
50438 +
50439 +               rc = -ENOTCONN;
50440 +               if (port_user[unbind.port] != u) {
50441 +                       spin_unlock_irq(&port_user_lock);
50442 +                       break;
50443 +               }
50444 +
50445 +               port_user[unbind.port] = NULL;
50446 +               mask_evtchn(unbind.port);
50447 +
50448 +               spin_unlock_irq(&port_user_lock);
50449 +
50450 +               op.cmd = EVTCHNOP_close;
50451 +               op.u.close.port = unbind.port;
50452 +               ret = HYPERVISOR_event_channel_op(&op);
50453 +               BUG_ON(ret);
50454 +
50455 +               rc = 0;
50456 +               break;
50457 +       }
50458 +
50459 +       case IOCTL_EVTCHN_NOTIFY: {
50460 +               struct ioctl_evtchn_notify notify;
50461 +
50462 +               rc = -EFAULT;
50463 +               if (copy_from_user(&notify, uarg, sizeof(notify)))
50464 +                       break;
50465 +
50466 +               if (notify.port >= NR_EVENT_CHANNELS) {
50467 +                       rc = -EINVAL;
50468 +               } else if (port_user[notify.port] != u) {
50469 +                       rc = -ENOTCONN;
50470 +               } else {
50471 +                       notify_remote_via_evtchn(notify.port);
50472 +                       rc = 0;
50473 +               }
50474 +               break;
50475 +       }
50476 +
50477 +       case IOCTL_EVTCHN_RESET: {
50478 +               /* Initialise the ring to empty. Clear errors. */
50479 +               spin_lock_irq(&port_user_lock);
50480 +               u->ring_cons = u->ring_prod = u->ring_overflow = 0;
50481 +               spin_unlock_irq(&port_user_lock);
50482 +               rc = 0;
50483 +               break;
50484 +       }
50485 +
50486 +       default:
50487 +               rc = -ENOSYS;
50488 +               break;
50489 +       }
50490 +
50491 +       return rc;
50492 +}
50493 +
50494 +static unsigned int evtchn_poll(struct file *file, poll_table *wait)
50495 +{
50496 +       unsigned int mask = POLLOUT | POLLWRNORM;
50497 +       struct per_user_data *u = file->private_data;
50498 +
50499 +       poll_wait(file, &u->evtchn_wait, wait);
50500 +       if (u->ring_cons != u->ring_prod)
50501 +               mask |= POLLIN | POLLRDNORM;
50502 +       if (u->ring_overflow)
50503 +               mask = POLLERR;
50504 +       return mask;
50505 +}
50506 +
50507 +static int evtchn_fasync(int fd, struct file *filp, int on)
50508 +{
50509 +       struct per_user_data *u = filp->private_data;
50510 +       return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
50511 +}
50512 +
50513 +static int evtchn_open(struct inode *inode, struct file *filp)
50514 +{
50515 +       struct per_user_data *u;
50516 +
50517 +       if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL)
50518 +               return -ENOMEM;
50519 +
50520 +       memset(u, 0, sizeof(*u));
50521 +       init_waitqueue_head(&u->evtchn_wait);
50522 +
50523 +       u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
50524 +       if (u->ring == NULL) {
50525 +               kfree(u);
50526 +               return -ENOMEM;
50527 +       }
50528 +
50529 +       filp->private_data = u;
50530 +
50531 +       return 0;
50532 +}
50533 +
50534 +static int evtchn_release(struct inode *inode, struct file *filp)
50535 +{
50536 +       int i;
50537 +       struct per_user_data *u = filp->private_data;
50538 +       evtchn_op_t op = { 0 };
50539 +
50540 +       spin_lock_irq(&port_user_lock);
50541 +
50542 +       free_page((unsigned long)u->ring);
50543 +
50544 +       for (i = 0; i < NR_EVENT_CHANNELS; i++) {
50545 +               int ret;
50546 +               if (port_user[i] != u)
50547 +                       continue;
50548 +
50549 +               port_user[i] = NULL;
50550 +               mask_evtchn(i);
50551 +
50552 +               op.cmd = EVTCHNOP_close;
50553 +               op.u.close.port = i;
50554 +               ret = HYPERVISOR_event_channel_op(&op);
50555 +               BUG_ON(ret);
50556 +       }
50557 +
50558 +       spin_unlock_irq(&port_user_lock);
50559 +
50560 +       kfree(u);
50561 +
50562 +       return 0;
50563 +}
50564 +
50565 +static struct file_operations evtchn_fops = {
50566 +       .owner   = THIS_MODULE,
50567 +       .read    = evtchn_read,
50568 +       .write   = evtchn_write,
50569 +       .ioctl   = evtchn_ioctl,
50570 +       .poll    = evtchn_poll,
50571 +       .fasync  = evtchn_fasync,
50572 +       .open    = evtchn_open,
50573 +       .release = evtchn_release,
50574 +};
50575 +
50576 +static struct miscdevice evtchn_miscdev = {
50577 +       .minor        = EVTCHN_MINOR,
50578 +       .name         = "evtchn",
50579 +       .fops         = &evtchn_fops,
50580 +       .devfs_name   = "misc/evtchn",
50581 +};
50582 +
50583 +static int __init evtchn_init(void)
50584 +{
50585 +       int err;
50586 +
50587 +       spin_lock_init(&port_user_lock);
50588 +       memset(port_user, 0, sizeof(port_user));
50589 +
50590 +       /* Create '/dev/misc/evtchn'. */
50591 +       err = misc_register(&evtchn_miscdev);
50592 +       if (err != 0) {
50593 +               printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
50594 +               return err;
50595 +       }
50596 +
50597 +       printk(KERN_INFO "Event-channel device installed.\n");
50598 +
50599 +       return 0;
50600 +}
50601 +
50602 +static void evtchn_cleanup(void)
50603 +{
50604 +       misc_deregister(&evtchn_miscdev);
50605 +}
50606 +
50607 +module_init(evtchn_init);
50608 +module_exit(evtchn_cleanup);
50609 +
50610 +MODULE_LICENSE("Dual BSD/GPL");
50611 +
50612 +/*
50613 + * Local variables:
50614 + *  c-file-style: "linux"
50615 + *  indent-tabs-mode: t
50616 + *  c-indent-level: 8
50617 + *  c-basic-offset: 8
50618 + *  tab-width: 8
50619 + * End:
50620 + */
50621 diff -Nurp ref-linux-2.6.16.9/drivers/xen/evtchn/Makefile tmp-linux-2.6-xen.patch/drivers/xen/evtchn/Makefile
50622 --- ref-linux-2.6.16.9/drivers/xen/evtchn/Makefile      1970-01-01 01:00:00.000000000 +0100
50623 +++ tmp-linux-2.6-xen.patch/drivers/xen/evtchn/Makefile 2006-04-10 00:05:52.000000000 +0200
50624 @@ -0,0 +1,2 @@
50625 +
50626 +obj-y  := evtchn.o
50627 diff -Nurp ref-linux-2.6.16.9/drivers/xen/Kconfig tmp-linux-2.6-xen.patch/drivers/xen/Kconfig
50628 --- ref-linux-2.6.16.9/drivers/xen/Kconfig      1970-01-01 01:00:00.000000000 +0100
50629 +++ tmp-linux-2.6-xen.patch/drivers/xen/Kconfig 2006-04-10 00:05:52.000000000 +0200
50630 @@ -0,0 +1,212 @@
50631 +#
50632 +# This Kconfig describes Xen options
50633 +#
50634 +
50635 +mainmenu "Xen Configuration"
50636 +
50637 +config XEN
50638 +       bool
50639 +       default y if X86_XEN || X86_64_XEN
50640 +       help
50641 +         This is the Linux Xen port.
50642 +
50643 +if XEN
50644 +config XEN_INTERFACE_VERSION
50645 +       hex
50646 +       default 0x00030101
50647 +
50648 +menu "XEN"
50649 +
50650 +config XEN_PRIVILEGED_GUEST
50651 +       bool "Privileged Guest (domain 0)"
50652 +       depends on XEN
50653 +       default n
50654 +       help
50655 +         Support for privileged operation (domain 0)
50656 +
50657 +config XEN_UNPRIVILEGED_GUEST
50658 +       bool
50659 +       default !XEN_PRIVILEGED_GUEST
50660 +
50661 +config XEN_PCIDEV_BACKEND
50662 +       tristate "PCI device backend driver"
50663 +       depends on PCI
50664 +       default XEN_PRIVILEGED_GUEST
50665 +       help
50666 +         The PCI device backend driver allows the kernel to export arbitrary
50667 +         PCI devices to other guests. If you select this to be a module, you
50668 +         will need to make sure no other driver has bound to the device(s)
50669 +         you want to make visible to other guests.
50670 +
50671 +choice
50672 +       prompt "PCI Backend Mode"
50673 +       depends on XEN_PCIDEV_BACKEND
50674 +       default XEN_PCIDEV_BACKEND_VPCI
50675 +
50676 +config XEN_PCIDEV_BACKEND_VPCI
50677 +       bool "Virtual PCI"
50678 +       ---help---
50679 +         This PCI Backend hides the true PCI topology and makes the frontend
50680 +         think there is a single PCI bus with only the exported devices on it.
50681 +         For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
50682 +         second device at 02:1a.0 will be re-assigned to 00:01.0.
50683 +
50684 +config XEN_PCIDEV_BACKEND_PASS
50685 +       bool "Passthrough"
50686 +       ---help---
50687 +         This PCI Backend provides a real view of the PCI topology to the
50688 +         frontend (for example, a device at 06:01.b will still appear at
50689 +         06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
50690 +         PCI devices to its driver domains. This may be required for drivers
50691 +         which depend on finding their hardware in certain bus/slot
50692 +         locations.
50693 +
50694 +endchoice
50695 +
50696 +config XEN_PCIDEV_BE_DEBUG
50697 +       bool "PCI Backend Debugging"
50698 +       depends on XEN_PCIDEV_BACKEND
50699 +       default n
50700 +
50701 +config XEN_BLKDEV_BACKEND
50702 +       tristate "Block-device backend driver"
50703 +       default y
50704 +       help
50705 +         The block-device backend driver allows the kernel to export its
50706 +         block devices to other guests via a high-performance shared-memory
50707 +         interface.
50708 +
50709 +config XEN_BLKDEV_TAP_BE
50710 +       tristate "Block Tap support for backend driver (DANGEROUS)"
50711 +       depends on XEN_BLKDEV_BACKEND
50712 +       default n
50713 +       help
50714 +         If you intend to use the block tap driver, the backend domain will
50715 +         not know the domain id of the real frontend, and so will not be able
50716 +         to map its data pages.  This modifies the backend to attempt to map
50717 +         from both the tap domain and the real frontend.  This presents a
50718 +         security risk, and so should ONLY be used for development
50719 +         with the blktap.  This option will be removed as the block drivers are
50720 +         modified to use grant tables.
50721 +
50722 +config XEN_NETDEV_BACKEND
50723 +       tristate "Network-device backend driver"
50724 +       default y
50725 +       help
50726 +         The network-device backend driver allows the kernel to export its
50727 +         network devices to other guests via a high-performance shared-memory
50728 +         interface.
50729 +
50730 +config XEN_NETDEV_PIPELINED_TRANSMITTER
50731 +       bool "Pipelined transmitter (DANGEROUS)"
50732 +       depends on XEN_NETDEV_BACKEND
50733 +       default n
50734 +       help
50735 +         If the net backend is a dumb domain, such as a transparent Ethernet
50736 +         bridge with no local IP interface, it is safe to say Y here to get
50737 +         slightly lower network overhead.
50738 +         If the backend has a local IP interface; or may be doing smart things
50739 +         like reassembling packets to perform firewall filtering; or if you
50740 +         are unsure; or if you experience network hangs when this option is
50741 +         enabled; then you must say N here.
50742 +
50743 +config XEN_NETDEV_LOOPBACK
50744 +       tristate "Network-device loopback driver"
50745 +       depends on XEN_NETDEV_BACKEND
50746 +       default y
50747 +       help
50748 +         A two-interface loopback device to emulate a local netfront-netback
50749 +         connection.
50750 +
50751 +config XEN_TPMDEV_BACKEND
50752 +       tristate "TPM-device backend driver"
50753 +       default n
50754 +       help
50755 +         The TPM-device backend driver allows the kernel to export virtual TPM devices to other guests.
50756 +
50757 +config XEN_TPMDEV_CLOSE_IF_VTPM_FAILS
50758 +       bool "TPM backend closes upon vTPM failure"
50759 +       depends on XEN_TPMDEV_BACKEND
50760 +       default n
50761 +       help
50762 +         The TPM backend closes the corresponding domain's channel if the
50763 +         vTPM in userspace indicates a failure. Say Y if you want this
50764 +         feature.
50765 +
50766 +config XEN_BLKDEV_FRONTEND
50767 +       tristate "Block-device frontend driver"
50768 +       depends on XEN
50769 +       default y
50770 +       help
50771 +         The block-device frontend driver allows the kernel to access block
50772 +         devices mounted within another guest OS. Unless you are building a
50773 +         dedicated device-driver domain or your master control domain
50774 +         (domain 0), you almost certainly want to say Y here.
50775 +
50776 +config XEN_NETDEV_FRONTEND
50777 +       tristate "Network-device frontend driver"
50778 +       depends on XEN
50779 +       default y
50780 +       help
50781 +         The network-device frontend driver allows the kernel to access
50782 +         network interfaces within another guest OS. Unless you are building
50783 +         a dedicated device-driver domain or your master control domain
50784 +         (domain 0), you almost certainly want to say Y here.
50785 +
50786 +config XEN_BLKDEV_TAP
50787 +       tristate "Block device tap driver"
50788 +       default n
50789 +       help
50790 +         This driver allows a VM to interact on block device channels
50791 +         to other VMs.  Block messages may be passed through or redirected
50792 +         to a character device, allowing device prototyping in application
50793 +         space.  Odds are that you want to say N here.
50794 +
50795 +config XEN_TPMDEV_FRONTEND
50796 +       tristate "TPM-device frontend driver"
50797 +       default n
50798 +       select TCG_TPM
50799 +       select TCG_XEN
50800 +       help
50801 +         The TPM-device frontend driver allows the kernel to access virtual TPM devices provided by another guest OS.
50802 +
50803 +config XEN_SCRUB_PAGES
50804 +       bool "Scrub memory before freeing it to Xen"
50805 +       default y
50806 +       help
50807 +         Erase memory contents before freeing it back to Xen's global
50808 +         pool. This ensures that any secrets contained within that
50809 +         memory (e.g., private keys) cannot be found by other guests that
50810 +         may be running on the machine. Most people will want to say Y here.
50811 +         If security is not a concern then you may increase performance by
50812 +         saying N.
50813 +
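The effect of this option can be pictured as a short hook on the page-free path. A minimal sketch, assuming pages are zeroed just before being handed back to the hypervisor (scrub_pages is a hypothetical name here, not necessarily the symbol the balloon driver uses):

        #include <linux/mm.h>
        #include <linux/highmem.h>

        /* Hypothetical sketch: zero each page before returning it to Xen. */
        static void scrub_pages(struct page *page, unsigned int count)
        {
                unsigned int i;

                for (i = 0; i < count; i++)
                        clear_highpage(page + i);  /* kmap, clear_page, kunmap */
        }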
50814 +config XEN_DISABLE_SERIAL
50815 +       bool "Disable serial port drivers"
50816 +       default y
50817 +       help
50818 +         Disable serial port drivers, allowing the Xen console driver
50819 +         to provide a serial console at ttyS0.
50820 +
50821 +config XEN_SYSFS
50822 +       tristate "Export Xen attributes in sysfs"
50823 +       depends on SYSFS
50824 +       default y
50825 +       help
50826 +         Xen hypervisor attributes will show up under /sys/hypervisor/.
50827 +
50828 +endmenu
50829 +
50830 +config HAVE_ARCH_ALLOC_SKB
50831 +       bool
50832 +       default y
50833 +
50834 +config HAVE_ARCH_DEV_ALLOC_SKB
50835 +       bool
50836 +       default y
50837 +
50838 +config NO_IDLE_HZ
50839 +       bool
50840 +       default y
50841 +
50842 +endif
50843 diff -Nurp ref-linux-2.6.16.9/drivers/xen/Makefile tmp-linux-2.6-xen.patch/drivers/xen/Makefile
50844 --- ref-linux-2.6.16.9/drivers/xen/Makefile     1970-01-01 01:00:00.000000000 +0100
50845 +++ tmp-linux-2.6-xen.patch/drivers/xen/Makefile        2006-04-10 00:05:52.000000000 +0200
50846 @@ -0,0 +1,22 @@
50847 +
50848 +obj-y  += net_driver_util.o
50849 +obj-y  += util.o
50850 +
50851 +obj-y  += core/
50852 +obj-y  += char/
50853 +obj-y  += console/
50854 +obj-y  += evtchn/
50855 +obj-y  += balloon/
50856 +obj-y  += privcmd/
50857 +obj-y  += xenbus/
50858 +
50859 +obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += blkback/
50860 +obj-$(CONFIG_XEN_NETDEV_BACKEND)       += netback/
50861 +obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmback/
50862 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += blkfront/
50863 +obj-$(CONFIG_XEN_NETDEV_FRONTEND)      += netfront/
50864 +obj-$(CONFIG_XEN_BLKDEV_TAP)           += blktap/
50865 +obj-$(CONFIG_XEN_TPMDEV_FRONTEND)      += tpmfront/
50866 +obj-$(CONFIG_XEN_PCIDEV_BACKEND)       += pciback/
50867 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront/
50868 +
50869 diff -Nurp ref-linux-2.6.16.9/drivers/xen/netback/common.h tmp-linux-2.6-xen.patch/drivers/xen/netback/common.h
50870 --- ref-linux-2.6.16.9/drivers/xen/netback/common.h     1970-01-01 01:00:00.000000000 +0100
50871 +++ tmp-linux-2.6-xen.patch/drivers/xen/netback/common.h        2006-04-10 00:05:52.000000000 +0200
50872 @@ -0,0 +1,133 @@
50873 +/******************************************************************************
50874 + * arch/xen/drivers/netif/backend/common.h
50875 + * 
50876 + * This program is free software; you can redistribute it and/or
50877 + * modify it under the terms of the GNU General Public License version 2
50878 + * as published by the Free Software Foundation; or, when distributed
50879 + * separately from the Linux kernel or incorporated into other
50880 + * software packages, subject to the following license:
50881 + * 
50882 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50883 + * of this source file (the "Software"), to deal in the Software without
50884 + * restriction, including without limitation the rights to use, copy, modify,
50885 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50886 + * and to permit persons to whom the Software is furnished to do so, subject to
50887 + * the following conditions:
50888 + * 
50889 + * The above copyright notice and this permission notice shall be included in
50890 + * all copies or substantial portions of the Software.
50891 + * 
50892 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50893 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50894 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50895 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50896 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50897 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50898 + * IN THE SOFTWARE.
50899 + */
50900 +
50901 +#ifndef __NETIF__BACKEND__COMMON_H__
50902 +#define __NETIF__BACKEND__COMMON_H__
50903 +
50904 +#include <linux/config.h>
50905 +#include <linux/version.h>
50906 +#include <linux/module.h>
50907 +#include <linux/interrupt.h>
50908 +#include <linux/slab.h>
50909 +#include <linux/ip.h>
50910 +#include <linux/in.h>
50911 +#include <linux/netdevice.h>
50912 +#include <linux/etherdevice.h>
50913 +#include <xen/evtchn.h>
50914 +#include <xen/interface/io/netif.h>
50915 +#include <asm/io.h>
50916 +#include <asm/pgalloc.h>
50917 +#include <xen/interface/grant_table.h>
50918 +#include <xen/gnttab.h>
50919 +#include <xen/driver_util.h>
50920 +
50921 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
50922 +                                    __FILE__ , __LINE__ , ## _a )
50923 +#define IPRINTK(fmt, args...) \
50924 +    printk(KERN_INFO "xen_net: " fmt, ##args)
50925 +#define WPRINTK(fmt, args...) \
50926 +    printk(KERN_WARNING "xen_net: " fmt, ##args)
50927 +
50928 +typedef struct netif_st {
50929 +       /* Unique identifier for this interface. */
50930 +       domid_t          domid;
50931 +       unsigned int     handle;
50932 +
50933 +       u8               fe_dev_addr[6];
50934 +
50935 +       /* Physical parameters of the comms window. */
50936 +       grant_handle_t   tx_shmem_handle;
50937 +       grant_ref_t      tx_shmem_ref; 
50938 +       grant_handle_t   rx_shmem_handle;
50939 +       grant_ref_t      rx_shmem_ref; 
50940 +       unsigned int     evtchn;
50941 +       unsigned int     irq;
50942 +
50943 +       /* The shared rings and indexes. */
50944 +       netif_tx_back_ring_t tx;
50945 +       netif_rx_back_ring_t rx;
50946 +       struct vm_struct *tx_comms_area;
50947 +       struct vm_struct *rx_comms_area;
50948 +
50949 +       /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
50950 +       RING_IDX rx_req_cons_peek;
50951 +
50952 +       /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
50953 +       unsigned long   credit_bytes;
50954 +       unsigned long   credit_usec;
50955 +       unsigned long   remaining_credit;
50956 +       struct timer_list credit_timeout;
50957 +
50958 +       /* Miscellaneous private stuff. */
50959 +       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
50960 +       int active;
50961 +       struct list_head list;  /* scheduling list */
50962 +       atomic_t         refcnt;
50963 +       struct net_device *dev;
50964 +       struct net_device_stats stats;
50965 +
50966 +       struct work_struct free_work;
50967 +} netif_t;
50968 +
50969 +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
50970 +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
50971 +
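A sizing note: assuming the standard __RING_SIZE macro from xen/interface/io/ring.h of this era, each of these evaluates to the number of slots that fit in one page, rounded down to a power of two. The power-of-two property is what allows ring indexes to be free-running counters reduced by a cheap mask, for example:

        /* Illustrative only: reduce a free-running index to a ring slot. */
        #define RING_SLOT(idx, ring_size)  ((idx) & ((ring_size) - 1))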
50972 +void netif_disconnect(netif_t *netif);
50973 +
50974 +netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]);
50975 +void free_netif(netif_t *netif);
50976 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
50977 +             unsigned long rx_ring_ref, unsigned int evtchn);
50978 +
50979 +#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
50980 +#define netif_put(_b)                                          \
50981 +       do {                                                    \
50982 +               if ( atomic_dec_and_test(&(_b)->refcnt) )       \
50983 +                       free_netif(_b);                         \
50984 +       } while (0)
50985 +
50986 +void netif_xenbus_init(void);
50987 +
50988 +void netif_schedule_work(netif_t *netif);
50989 +void netif_deschedule_work(netif_t *netif);
50990 +
50991 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
50992 +struct net_device_stats *netif_be_get_stats(struct net_device *dev);
50993 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
50994 +
50995 +#endif /* __NETIF__BACKEND__COMMON_H__ */
50996 +
50997 +/*
50998 + * Local variables:
50999 + *  c-file-style: "linux"
51000 + *  indent-tabs-mode: t
51001 + *  c-indent-level: 8
51002 + *  c-basic-offset: 8
51003 + *  tab-width: 8
51004 + * End:
51005 + */
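The netif_get()/netif_put() macros above implement plain reference counting on netif_t. A minimal usage sketch of the intended pattern (example_use is hypothetical):

        /* Hypothetical caller: pin the interface while work is outstanding. */
        static void example_use(netif_t *netif)
        {
                netif_get(netif);       /* refcnt++ */
                /* ... hand netif to asynchronous work ... */
                netif_put(netif);       /* refcnt--; calls free_netif() at zero */
        }

Because free_netif() defers the real teardown to a workqueue (see interface.c below), the final netif_put() is safe even from softirq context.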
51006 diff -Nurp ref-linux-2.6.16.9/drivers/xen/netback/interface.c tmp-linux-2.6-xen.patch/drivers/xen/netback/interface.c
51007 --- ref-linux-2.6.16.9/drivers/xen/netback/interface.c  1970-01-01 01:00:00.000000000 +0100
51008 +++ tmp-linux-2.6-xen.patch/drivers/xen/netback/interface.c     2006-04-10 00:05:52.000000000 +0200
51009 @@ -0,0 +1,334 @@
51010 +/******************************************************************************
51011 + * arch/xen/drivers/netif/backend/interface.c
51012 + * 
51013 + * Network-device interface management.
51014 + * 
51015 + * Copyright (c) 2004-2005, Keir Fraser
51016 + * 
51017 + * This program is free software; you can redistribute it and/or
51018 + * modify it under the terms of the GNU General Public License version 2
51019 + * as published by the Free Software Foundation; or, when distributed
51020 + * separately from the Linux kernel or incorporated into other
51021 + * software packages, subject to the following license:
51022 + * 
51023 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51024 + * of this source file (the "Software"), to deal in the Software without
51025 + * restriction, including without limitation the rights to use, copy, modify,
51026 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51027 + * and to permit persons to whom the Software is furnished to do so, subject to
51028 + * the following conditions:
51029 + * 
51030 + * The above copyright notice and this permission notice shall be included in
51031 + * all copies or substantial portions of the Software.
51032 + * 
51033 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51034 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51035 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51036 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51037 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51038 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51039 + * IN THE SOFTWARE.
51040 + */
51041 +
51042 +#include "common.h"
51043 +#include <linux/ethtool.h>
51044 +#include <linux/rtnetlink.h>
51045 +
51046 +static void __netif_up(netif_t *netif)
51047 +{
51048 +       struct net_device *dev = netif->dev;
51049 +       spin_lock_bh(&dev->xmit_lock);
51050 +       netif->active = 1;
51051 +       spin_unlock_bh(&dev->xmit_lock);
51052 +       enable_irq(netif->irq);
51053 +       netif_schedule_work(netif);
51054 +}
51055 +
51056 +static void __netif_down(netif_t *netif)
51057 +{
51058 +       struct net_device *dev = netif->dev;
51059 +       disable_irq(netif->irq);
51060 +       spin_lock_bh(&dev->xmit_lock);
51061 +       netif->active = 0;
51062 +       spin_unlock_bh(&dev->xmit_lock);
51063 +       netif_deschedule_work(netif);
51064 +}
51065 +
51066 +static int net_open(struct net_device *dev)
51067 +{
51068 +       netif_t *netif = netdev_priv(dev);
51069 +       if (netif->status == CONNECTED)
51070 +               __netif_up(netif);
51071 +       netif_start_queue(dev);
51072 +       return 0;
51073 +}
51074 +
51075 +static int net_close(struct net_device *dev)
51076 +{
51077 +       netif_t *netif = netdev_priv(dev);
51078 +       netif_stop_queue(dev);
51079 +       if (netif->status == CONNECTED)
51080 +               __netif_down(netif);
51081 +       return 0;
51082 +}
51083 +
51084 +static struct ethtool_ops network_ethtool_ops =
51085 +{
51086 +       .get_tx_csum = ethtool_op_get_tx_csum,
51087 +       .set_tx_csum = ethtool_op_set_tx_csum,
51088 +};
51089 +
51090 +netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN])
51091 +{
51092 +       int err = 0, i;
51093 +       struct net_device *dev;
51094 +       netif_t *netif;
51095 +       char name[IFNAMSIZ] = {};
51096 +
51097 +       snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
51098 +       dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
51099 +       if (dev == NULL) {
51100 +               DPRINTK("Could not create netif: out of memory\n");
51101 +               return ERR_PTR(-ENOMEM);
51102 +       }
51103 +
51104 +       netif = netdev_priv(dev);
51105 +       memset(netif, 0, sizeof(*netif));
51106 +       netif->domid  = domid;
51107 +       netif->handle = handle;
51108 +       netif->status = DISCONNECTED;
51109 +       atomic_set(&netif->refcnt, 0);
51110 +       netif->dev = dev;
51111 +
51112 +       netif->credit_bytes = netif->remaining_credit = ~0UL;
51113 +       netif->credit_usec  = 0UL;
51114 +       init_timer(&netif->credit_timeout);
51115 +
51116 +       dev->hard_start_xmit = netif_be_start_xmit;
51117 +       dev->get_stats       = netif_be_get_stats;
51118 +       dev->open            = net_open;
51119 +       dev->stop            = net_close;
51120 +       dev->features        = NETIF_F_IP_CSUM;
51121 +
51122 +       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
51123 +
51124 +       /* Disable queuing. */
51125 +       dev->tx_queue_len = 0;
51126 +
51127 +       for (i = 0; i < ETH_ALEN; i++)
51128 +               if (be_mac[i] != 0)
51129 +                       break;
51130 +       if (i == ETH_ALEN) {
51131 +               /*
51132 +                * Initialise a dummy MAC address. We choose the numerically
51133 +                * largest non-broadcast address to prevent the address getting
51134 +                * stolen by an Ethernet bridge for STP purposes.
51135 +                 * (FE:FF:FF:FF:FF:FF) 
51136 +                */ 
51137 +               memset(dev->dev_addr, 0xFF, ETH_ALEN);
51138 +               dev->dev_addr[0] &= ~0x01;
51139 +       } else
51140 +               memcpy(dev->dev_addr, be_mac, ETH_ALEN);
51141 +
51142 +       rtnl_lock();
51143 +       err = register_netdevice(dev);
51144 +       rtnl_unlock();
51145 +       if (err) {
51146 +               DPRINTK("Could not register new net device %s: err=%d\n",
51147 +                       dev->name, err);
51148 +               free_netdev(dev);
51149 +               return ERR_PTR(err);
51150 +       }
51151 +
51152 +       DPRINTK("Successfully created netif\n");
51153 +       return netif;
51154 +}
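Since alloc_netif() reports failure through ERR_PTR(), a caller (presumably the xenbus probe path in xenbus.c, which is not shown here) would check it along these lines:

        /* Hypothetical caller sketch. */
        netif_t *netif = alloc_netif(domid, handle, be_mac);
        if (IS_ERR(netif))
                return PTR_ERR(netif);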
51155 +
51156 +static int map_frontend_pages(
51157 +       netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
51158 +{
51159 +       struct gnttab_map_grant_ref op;
51160 +       int ret;
51161 +
51162 +       op.host_addr = (unsigned long)netif->tx_comms_area->addr;
51163 +       op.flags     = GNTMAP_host_map;
51164 +       op.ref       = tx_ring_ref;
51165 +       op.dom       = netif->domid;
51166 +    
51167 +       lock_vm_area(netif->tx_comms_area);
51168 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
51169 +       unlock_vm_area(netif->tx_comms_area);
51170 +       BUG_ON(ret);
51171 +
51172 +       if (op.status) { 
51173 +               DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
51174 +               return op.status;
51175 +       }
51176 +
51177 +       netif->tx_shmem_ref    = tx_ring_ref;
51178 +       netif->tx_shmem_handle = op.handle;
51179 +
51180 +       op.host_addr = (unsigned long)netif->rx_comms_area->addr;
51181 +       op.flags     = GNTMAP_host_map;
51182 +       op.ref       = rx_ring_ref;
51183 +       op.dom       = netif->domid;
51184 +
51185 +       lock_vm_area(netif->rx_comms_area);
51186 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
51187 +       unlock_vm_area(netif->rx_comms_area);
51188 +       BUG_ON(ret);
51189 +
51190 +       if (op.status) {
51191 +               DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
51192 +               return op.status;
51193 +       }
51194 +
51195 +       netif->rx_shmem_ref    = rx_ring_ref;
51196 +       netif->rx_shmem_handle = op.handle;
51197 +
51198 +       return 0;
51199 +}
51200 +
51201 +static void unmap_frontend_pages(netif_t *netif)
51202 +{
51203 +       struct gnttab_unmap_grant_ref op;
51204 +       int ret;
51205 +
51206 +       op.host_addr    = (unsigned long)netif->tx_comms_area->addr;
51207 +       op.handle       = netif->tx_shmem_handle;
51208 +       op.dev_bus_addr = 0;
51209 +
51210 +       lock_vm_area(netif->tx_comms_area);
51211 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
51212 +       unlock_vm_area(netif->tx_comms_area);
51213 +       BUG_ON(ret);
51214 +
51215 +       op.host_addr    = (unsigned long)netif->rx_comms_area->addr;
51216 +       op.handle       = netif->rx_shmem_handle;
51217 +       op.dev_bus_addr = 0;
51218 +
51219 +       lock_vm_area(netif->rx_comms_area);
51220 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
51221 +       unlock_vm_area(netif->rx_comms_area);
51222 +       BUG_ON(ret);
51223 +}
51224 +
51225 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
51226 +             unsigned long rx_ring_ref, unsigned int evtchn)
51227 +{
51228 +       int err = -ENOMEM;
51229 +       netif_tx_sring_t *txs;
51230 +       netif_rx_sring_t *rxs;
51231 +       evtchn_op_t op = {
51232 +               .cmd = EVTCHNOP_bind_interdomain,
51233 +               .u.bind_interdomain.remote_dom = netif->domid,
51234 +               .u.bind_interdomain.remote_port = evtchn };
51235 +
51236 +       /* Already connected through? */
51237 +       if (netif->irq)
51238 +               return 0;
51239 +
51240 +       netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
51241 +       if (netif->tx_comms_area == NULL)
51242 +               return -ENOMEM;
51243 +       netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
51244 +       if (netif->rx_comms_area == NULL)
51245 +               goto err_rx;
51246 +
51247 +       err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
51248 +       if (err)
51249 +               goto err_map;
51250 +
51251 +       err = HYPERVISOR_event_channel_op(&op);
51252 +       if (err)
51253 +               goto err_hypervisor;
51254 +
51255 +       netif->evtchn = op.u.bind_interdomain.local_port;
51256 +
51257 +       netif->irq = bind_evtchn_to_irqhandler(
51258 +               netif->evtchn, netif_be_int, 0, netif->dev->name, netif);
51259 +       disable_irq(netif->irq);
51260 +
51261 +       txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
51262 +       BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
51263 +
51264 +       rxs = (netif_rx_sring_t *)netif->rx_comms_area->addr;
51266 +       BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
51267 +
51268 +       netif->rx_req_cons_peek = 0;
51269 +
51270 +       netif_get(netif);
51271 +       wmb(); /* Other CPUs see new state before interface is started. */
51272 +
51273 +       rtnl_lock();
51274 +       netif->status = CONNECTED;
51275 +       wmb();
51276 +       if (netif_running(netif->dev))
51277 +               __netif_up(netif);
51278 +       rtnl_unlock();
51279 +
51280 +       return 0;
51281 +err_hypervisor:
51282 +       unmap_frontend_pages(netif);
51283 +err_map:
51284 +       free_vm_area(netif->rx_comms_area);
51285 +err_rx:
51286 +       free_vm_area(netif->tx_comms_area);
51287 +       return err;
51288 +}
51289 +
51290 +static void free_netif_callback(void *arg)
51291 +{
51292 +       netif_t *netif = (netif_t *)arg;
51293 +
51294 +       if (netif->irq)
51295 +               unbind_from_irqhandler(netif->irq, netif);
51296 +       
51297 +       unregister_netdev(netif->dev);
51298 +
51299 +       if (netif->tx.sring) {
51300 +               unmap_frontend_pages(netif);
51301 +               free_vm_area(netif->tx_comms_area);
51302 +               free_vm_area(netif->rx_comms_area);
51303 +       }
51304 +
51305 +       free_netdev(netif->dev);
51306 +}
51307 +
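/*
 * Teardown is deferred to a workqueue because the final netif_put() (and
 * hence free_netif()) can run in softirq context, e.g. from the rx/tx
 * tasklets in netback.c, while unregister_netdev() may sleep.
 */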
51308 +void free_netif(netif_t *netif)
51309 +{
51310 +       INIT_WORK(&netif->free_work, free_netif_callback, (void *)netif);
51311 +       schedule_work(&netif->free_work);
51312 +}
51313 +
51314 +void netif_disconnect(netif_t *netif)
51315 +{
51316 +       switch (netif->status) {
51317 +       case CONNECTED:
51318 +               rtnl_lock();
51319 +               netif->status = DISCONNECTING;
51320 +               wmb();
51321 +               if (netif_running(netif->dev))
51322 +                       __netif_down(netif);
51323 +               rtnl_unlock();
51324 +               netif_put(netif);
51325 +               break;
51326 +       case DISCONNECTED:
51327 +               BUG_ON(atomic_read(&netif->refcnt) != 0);
51328 +               free_netif(netif);
51329 +               break;
51330 +       default:
51331 +               BUG();
51332 +       }
51333 +}
51334 +
51335 +/*
51336 + * Local variables:
51337 + *  c-file-style: "linux"
51338 + *  indent-tabs-mode: t
51339 + *  c-indent-level: 8
51340 + *  c-basic-offset: 8
51341 + *  tab-width: 8
51342 + * End:
51343 + */
51344 diff -Nurp ref-linux-2.6.16.9/drivers/xen/netback/loopback.c tmp-linux-2.6-xen.patch/drivers/xen/netback/loopback.c
51345 --- ref-linux-2.6.16.9/drivers/xen/netback/loopback.c   1970-01-01 01:00:00.000000000 +0100
51346 +++ tmp-linux-2.6-xen.patch/drivers/xen/netback/loopback.c      2006-04-10 00:05:52.000000000 +0200
51347 @@ -0,0 +1,254 @@
51348 +/******************************************************************************
51349 + * netback/loopback.c
51350 + * 
51351 + * A two-interface loopback device to emulate a local netfront-netback
51352 + * connection. This ensures that local packet delivery looks identical
51353 + * to inter-domain delivery. Most importantly, packets delivered locally
51354 + * originating from other domains will get *copied* when they traverse this
51355 + * driver. This prevents unbounded delays in socket-buffer queues from
51356 + * causing the netback driver to "seize up".
51357 + * 
51358 + * This driver creates a symmetric pair of loopback interfaces with names
51359 + * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
51360 + * bridge, just like a proper netback interface, while a local IP interface
51361 + * is configured on 'veth0'.
51362 + * 
51363 + * As with a real netback interface, vif0.0 is configured with a suitable
51364 + * dummy MAC address. No default is provided for veth0: a reasonable strategy
51365 + * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
51366 + * (to avoid confusing the Etherbridge).
51367 + * 
51368 + * Copyright (c) 2005 K A Fraser
51369 + * 
51370 + * This program is free software; you can redistribute it and/or
51371 + * modify it under the terms of the GNU General Public License version 2
51372 + * as published by the Free Software Foundation; or, when distributed
51373 + * separately from the Linux kernel or incorporated into other
51374 + * software packages, subject to the following license:
51375 + * 
51376 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51377 + * of this source file (the "Software"), to deal in the Software without
51378 + * restriction, including without limitation the rights to use, copy, modify,
51379 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51380 + * and to permit persons to whom the Software is furnished to do so, subject to
51381 + * the following conditions:
51382 + * 
51383 + * The above copyright notice and this permission notice shall be included in
51384 + * all copies or substantial portions of the Software.
51385 + * 
51386 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51387 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51388 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51389 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51390 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51391 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51392 + * IN THE SOFTWARE.
51393 + */
51394 +
51395 +#include <linux/config.h>
51396 +#include <linux/module.h>
51397 +#include <linux/netdevice.h>
51398 +#include <linux/inetdevice.h>
51399 +#include <linux/etherdevice.h>
51400 +#include <linux/skbuff.h>
51401 +#include <linux/ethtool.h>
51402 +#include <net/dst.h>
51403 +
51404 +static int nloopbacks = 8;
51405 +module_param(nloopbacks, int, 0);
51406 +MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
51407 +
51408 +struct net_private {
51409 +       struct net_device *loopback_dev;
51410 +       struct net_device_stats stats;
51411 +};
51412 +
51413 +static int loopback_open(struct net_device *dev)
51414 +{
51415 +       struct net_private *np = netdev_priv(dev);
51416 +       memset(&np->stats, 0, sizeof(np->stats));
51417 +       netif_start_queue(dev);
51418 +       return 0;
51419 +}
51420 +
51421 +static int loopback_close(struct net_device *dev)
51422 +{
51423 +       netif_stop_queue(dev);
51424 +       return 0;
51425 +}
51426 +
51427 +static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
51428 +{
51429 +       struct net_private *np = netdev_priv(dev);
51430 +
51431 +       dst_release(skb->dst);
51432 +       skb->dst = NULL;
51433 +
51434 +       skb_orphan(skb);
51435 +
51436 +       np->stats.tx_bytes += skb->len;
51437 +       np->stats.tx_packets++;
51438 +
51439 +       /* Switch to loopback context. */
51440 +       dev = np->loopback_dev;
51441 +       np  = netdev_priv(dev);
51442 +
51443 +       np->stats.rx_bytes += skb->len;
51444 +       np->stats.rx_packets++;
51445 +
51446 +       if (skb->ip_summed == CHECKSUM_HW) {
51447 +               /* Defer checksum calculation. */
51448 +               skb->proto_csum_blank = 1;
51449 +               /* Must be a local packet: assert its integrity. */
51450 +               skb->proto_data_valid = 1;
51451 +       }
51452 +
51453 +       skb->ip_summed = skb->proto_data_valid ?
51454 +               CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
51455 +
51456 +       skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
51457 +       skb->protocol = eth_type_trans(skb, dev);
51458 +       skb->dev      = dev;
51459 +       dev->last_rx  = jiffies;
51460 +       netif_rx(skb);
51461 +
51462 +       return 0;
51463 +}
51464 +
51465 +static struct net_device_stats *loopback_get_stats(struct net_device *dev)
51466 +{
51467 +       struct net_private *np = netdev_priv(dev);
51468 +       return &np->stats;
51469 +}
51470 +
51471 +static struct ethtool_ops network_ethtool_ops =
51472 +{
51473 +       .get_tx_csum = ethtool_op_get_tx_csum,
51474 +       .set_tx_csum = ethtool_op_set_tx_csum,
51475 +};
51476 +
51477 +static void loopback_construct(struct net_device *dev, struct net_device *lo)
51478 +{
51479 +       struct net_private *np = netdev_priv(dev);
51480 +
51481 +       np->loopback_dev     = lo;
51482 +
51483 +       dev->open            = loopback_open;
51484 +       dev->stop            = loopback_close;
51485 +       dev->hard_start_xmit = loopback_start_xmit;
51486 +       dev->get_stats       = loopback_get_stats;
51487 +
51488 +       dev->tx_queue_len    = 0;
51489 +
51490 +       dev->features        = (NETIF_F_HIGHDMA |
51491 +                               NETIF_F_LLTX |
51492 +                               NETIF_F_IP_CSUM);
51493 +
51494 +       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
51495 +
51496 +       /*
51497 +        * We do not set a jumbo MTU on the interface. Otherwise the network
51498 +        * stack will try to send large packets that will get dropped by the
51499 +        * Ethernet bridge (unless the physical Ethernet interface is
51500 +        * configured to transfer jumbo packets). If a larger MTU is desired
51501 +        * then the system administrator can specify it using the 'ifconfig'
51502 +        * command.
51503 +        */
51504 +       /*dev->mtu             = 16*1024;*/
51505 +}
51506 +
51507 +static int __init make_loopback(int i)
51508 +{
51509 +       struct net_device *dev1, *dev2;
51510 +       char dev_name[IFNAMSIZ];
51511 +       int err = -ENOMEM;
51512 +
51513 +       sprintf(dev_name, "vif0.%d", i);
51514 +       dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
51515 +       if (!dev1)
51516 +               return err;
51517 +
51518 +       sprintf(dev_name, "veth%d", i);
51519 +       dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
51520 +       if (!dev2)
51521 +               goto fail_netdev2;
51522 +
51523 +       loopback_construct(dev1, dev2);
51524 +       loopback_construct(dev2, dev1);
51525 +
51526 +       /*
51527 +        * Initialise a dummy MAC address for the 'dummy backend' interface. We
51528 +        * choose the numerically largest non-broadcast address to prevent the
51529 +        * address getting stolen by an Ethernet bridge for STP purposes.
51530 +        */
51531 +       memset(dev1->dev_addr, 0xFF, ETH_ALEN);
51532 +       dev1->dev_addr[0] &= ~0x01;
51533 +
51534 +       if ((err = register_netdev(dev1)) != 0)
51535 +               goto fail;
51536 +
51537 +       if ((err = register_netdev(dev2)) != 0) {
51538 +               unregister_netdev(dev1);
51539 +               goto fail;
51540 +       }
51541 +
51542 +       return 0;
51543 +
51544 + fail:
51545 +       free_netdev(dev2);
51546 + fail_netdev2:
51547 +       free_netdev(dev1);
51548 +       return err;
51549 +}
51550 +
51551 +static void __exit clean_loopback(int i)
51552 +{
51553 +       struct net_device *dev1, *dev2;
51554 +       char dev_name[IFNAMSIZ];
51555 +
51556 +       sprintf(dev_name, "vif0.%d", i);
51557 +       dev1 = dev_get_by_name(dev_name);
51558 +       sprintf(dev_name, "veth%d", i);
51559 +       dev2 = dev_get_by_name(dev_name);
51560 +       if (dev1 && dev2) {
51561 +               unregister_netdev(dev2);
51562 +               unregister_netdev(dev1);
51563 +               free_netdev(dev2);
51564 +               free_netdev(dev1);
51565 +       }
51566 +}
51567 +
51568 +static int __init loopback_init(void)
51569 +{
51570 +       int i, err = 0;
51571 +
51572 +       for (i = 0; i < nloopbacks; i++)
51573 +               if ((err = make_loopback(i)) != 0)
51574 +                       break;
51575 +
51576 +       return err;
51577 +}
51578 +
51579 +module_init(loopback_init);
51580 +
51581 +static void __exit loopback_exit(void)
51582 +{
51583 +       int i;
51584 +
51585 +       for (i = nloopbacks; i-- > 0; )
51586 +               clean_loopback(i);
51587 +}
51588 +
51589 +module_exit(loopback_exit);
51590 +
51591 +MODULE_LICENSE("Dual BSD/GPL");
51592 +
51593 +/*
51594 + * Local variables:
51595 + *  c-file-style: "linux"
51596 + *  indent-tabs-mode: t
51597 + *  c-indent-level: 8
51598 + *  c-basic-offset: 8
51599 + *  tab-width: 8
51600 + * End:
51601 + */
51602 diff -Nurp ref-linux-2.6.16.9/drivers/xen/netback/Makefile tmp-linux-2.6-xen.patch/drivers/xen/netback/Makefile
51603 --- ref-linux-2.6.16.9/drivers/xen/netback/Makefile     1970-01-01 01:00:00.000000000 +0100
51604 +++ tmp-linux-2.6-xen.patch/drivers/xen/netback/Makefile        2006-04-10 00:05:52.000000000 +0200
51605 @@ -0,0 +1,5 @@
51606 +obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
51607 +obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
51608 +
51609 +netbk-y   := netback.o xenbus.o interface.o
51610 +netloop-y := loopback.o
51611 diff -Nurp ref-linux-2.6.16.9/drivers/xen/netback/netback.c tmp-linux-2.6-xen.patch/drivers/xen/netback/netback.c
51612 --- ref-linux-2.6.16.9/drivers/xen/netback/netback.c    1970-01-01 01:00:00.000000000 +0100
51613 +++ tmp-linux-2.6-xen.patch/drivers/xen/netback/netback.c       2006-04-10 00:05:52.000000000 +0200
51614 @@ -0,0 +1,868 @@
51615 +/******************************************************************************
51616 + * drivers/xen/netback/netback.c
51617 + * 
51618 + * Back-end of the driver for virtual network devices. This portion of the
51619 + * driver exports a 'unified' network-device interface that can be accessed
51620 + * by any operating system that implements a compatible front end. A 
51621 + * reference front-end implementation can be found in:
51622 + *  drivers/xen/netfront/netfront.c
51623 + * 
51624 + * Copyright (c) 2002-2005, K A Fraser
51625 + * 
51626 + * This program is free software; you can redistribute it and/or
51627 + * modify it under the terms of the GNU General Public License version 2
51628 + * as published by the Free Software Foundation; or, when distributed
51629 + * separately from the Linux kernel or incorporated into other
51630 + * software packages, subject to the following license:
51631 + * 
51632 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51633 + * of this source file (the "Software"), to deal in the Software without
51634 + * restriction, including without limitation the rights to use, copy, modify,
51635 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51636 + * and to permit persons to whom the Software is furnished to do so, subject to
51637 + * the following conditions:
51638 + * 
51639 + * The above copyright notice and this permission notice shall be included in
51640 + * all copies or substantial portions of the Software.
51641 + * 
51642 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51643 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51644 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51645 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51646 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51647 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51648 + * IN THE SOFTWARE.
51649 + */
51650 +
51651 +#include "common.h"
51652 +#include <xen/balloon.h>
51653 +#include <xen/interface/memory.h>
51654 +
51655 +/*#define NETBE_DEBUG_INTERRUPT*/
51656 +
51657 +static void netif_idx_release(u16 pending_idx);
51658 +static void netif_page_release(struct page *page);
51659 +static void make_tx_response(netif_t *netif, 
51660 +                             u16      id,
51661 +                             s8       st);
51662 +static int  make_rx_response(netif_t *netif, 
51663 +                             u16      id, 
51664 +                             s8       st,
51665 +                             u16      offset,
51666 +                             u16      size,
51667 +                             u16      flags);
51668 +
51669 +static void net_tx_action(unsigned long unused);
51670 +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
51671 +
51672 +static void net_rx_action(unsigned long unused);
51673 +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
51674 +
51675 +static struct timer_list net_timer;
51676 +
51677 +#define MAX_PENDING_REQS 256
51678 +
51679 +static struct sk_buff_head rx_queue;
51680 +static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
51681 +static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
51682 +static gnttab_transfer_t grant_rx_op[NET_RX_RING_SIZE];
51683 +static unsigned char rx_notify[NR_IRQS];
51684 +
51685 +static unsigned long mmap_vstart;
51686 +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
51687 +
51688 +#define PKT_PROT_LEN 64
51689 +
51690 +static struct {
51691 +       netif_tx_request_t req;
51692 +       netif_t *netif;
51693 +} pending_tx_info[MAX_PENDING_REQS];
51694 +static u16 pending_ring[MAX_PENDING_REQS];
51695 +typedef unsigned int PEND_RING_IDX;
51696 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
51697 +static PEND_RING_IDX pending_prod, pending_cons;
51698 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
51699 +
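The bookkeeping here relies on pending_prod and pending_cons being free-running counters over a power-of-two ring. A worked example with hypothetical counter values (the ring initialisation itself is outside this excerpt):

        /*
         * pending_ring starts full of free entries, with pending_prod ahead
         * of pending_cons by the number of free slots.  Suppose
         * pending_prod = 260 and pending_cons = 7:
         *   in-flight requests = 256 - 260 + 7 = 3   (NR_PENDING_REQS)
         *   consumer ring slot = MASK_PEND_IDX(7)   = 7
         *   producer ring slot = MASK_PEND_IDX(260) = 4   (260 & 255)
         */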
51700 +/* Freed TX SKBs get batched on this ring before return to pending_ring. */
51701 +static u16 dealloc_ring[MAX_PENDING_REQS];
51702 +static PEND_RING_IDX dealloc_prod, dealloc_cons;
51703 +
51704 +static struct sk_buff_head tx_queue;
51705 +
51706 +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
51707 +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
51708 +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
51709 +
51710 +static struct list_head net_schedule_list;
51711 +static spinlock_t net_schedule_list_lock;
51712 +
51713 +#define MAX_MFN_ALLOC 64
51714 +static unsigned long mfn_list[MAX_MFN_ALLOC];
51715 +static unsigned int alloc_index = 0;
51716 +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED;
51717 +
51718 +static unsigned long alloc_mfn(void)
51719 +{
51720 +       unsigned long mfn = 0, flags;
51721 +       struct xen_memory_reservation reservation = {
51722 +               .extent_start = mfn_list,
51723 +               .nr_extents   = MAX_MFN_ALLOC,
51724 +               .extent_order = 0,
51725 +               .domid        = DOMID_SELF
51726 +       };
51727 +       spin_lock_irqsave(&mfn_lock, flags);
51728 +       if ( unlikely(alloc_index == 0) )
51729 +               alloc_index = HYPERVISOR_memory_op(
51730 +                       XENMEM_increase_reservation, &reservation);
51731 +       if ( alloc_index != 0 )
51732 +               mfn = mfn_list[--alloc_index];
51733 +       spin_unlock_irqrestore(&mfn_lock, flags);
51734 +       return mfn;
51735 +}
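/*
 * Note the batching above: when the cache is empty (alloc_index == 0), a
 * single XENMEM_increase_reservation hypercall can refill up to
 * MAX_MFN_ALLOC frames, and HYPERVISOR_memory_op() returns the number
 * actually granted; subsequent calls just pop cached MFNs under mfn_lock.
 */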
51736 +
51737 +static inline void maybe_schedule_tx_action(void)
51738 +{
51739 +       smp_mb();
51740 +       if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
51741 +           !list_empty(&net_schedule_list))
51742 +               tasklet_schedule(&net_tx_tasklet);
51743 +}
51744 +
51745 +/*
51746 + * A gross way of confirming the origin of an skb data page. The slab
51747 + * allocator abuses a field in the page struct to cache the kmem_cache_t ptr.
51748 + */
51749 +static inline int is_xen_skb(struct sk_buff *skb)
51750 +{
51751 +       extern kmem_cache_t *skbuff_cachep;
51752 +       kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next;
51753 +       return (cp == skbuff_cachep);
51754 +}
51755 +
51756 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
51757 +{
51758 +       netif_t *netif = netdev_priv(dev);
51759 +
51760 +       BUG_ON(skb->dev != dev);
51761 +
51762 +       /* Drop the packet if the target domain has no receive buffers. */
51763 +       if (!netif->active || 
51764 +           (netif->rx_req_cons_peek == netif->rx.sring->req_prod) ||
51765 +           ((netif->rx_req_cons_peek - netif->rx.rsp_prod_pvt) ==
51766 +            NET_RX_RING_SIZE))
51767 +               goto drop;
51768 +
51769 +       /*
51770 +        * We do not copy the packet unless:
51771 +        *  1. The data is shared; or
51772 +        *  2. The data is not allocated from our special cache.
51773 +        * NB. We also could not cope with fragmented packets, but we won't
51774 +        *     get any because we do not advertise the NETIF_F_SG feature.
51775 +        */
51776 +       if (skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb)) {
51777 +               int hlen = skb->data - skb->head;
51778 +               int ret;
51779 +               struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len);
51780 +               if ( unlikely(nskb == NULL) )
51781 +                       goto drop;
51782 +               skb_reserve(nskb, hlen);
51783 +               __skb_put(nskb, skb->len);
51784 +               ret = skb_copy_bits(skb, -hlen, nskb->data - hlen,
51785 +                                    skb->len + hlen);
51786 +               BUG_ON(ret);
51787 +               nskb->dev = skb->dev;
51788 +               nskb->proto_data_valid = skb->proto_data_valid;
51789 +               dev_kfree_skb(skb);
51790 +               skb = nskb;
51791 +       }
51792 +
51793 +       netif->rx_req_cons_peek++;
51794 +       netif_get(netif);
51795 +
51796 +       skb_queue_tail(&rx_queue, skb);
51797 +       tasklet_schedule(&net_rx_tasklet);
51798 +
51799 +       return 0;
51800 +
51801 + drop:
51802 +       netif->stats.tx_dropped++;
51803 +       dev_kfree_skb(skb);
51804 +       return 0;
51805 +}
51806 +
51807 +#if 0
51808 +static void xen_network_done_notify(void)
51809 +{
51810 +       static struct net_device *eth0_dev = NULL;
51811 +       if (unlikely(eth0_dev == NULL))
51812 +               eth0_dev = __dev_get_by_name("eth0");
51813 +       netif_rx_schedule(eth0_dev);
51814 +}
51815 +/* 
51816 + * Add following to poll() function in NAPI driver (Tigon3 is example):
51817 + *  if ( xen_network_done() )
51818 + *      tg3_enable_ints(tp); 
51819 + */
51820 +int xen_network_done(void)
51821 +{
51822 +       return skb_queue_empty(&rx_queue);
51823 +}
51824 +#endif
51825 +
51826 +static void net_rx_action(unsigned long unused)
51827 +{
51828 +       netif_t *netif = NULL; 
51829 +       s8 status;
51830 +       u16 size, id, irq, flags;
51831 +       multicall_entry_t *mcl;
51832 +       mmu_update_t *mmu;
51833 +       gnttab_transfer_t *gop;
51834 +       unsigned long vdata, old_mfn, new_mfn;
51835 +       struct sk_buff_head rxq;
51836 +       struct sk_buff *skb;
51837 +       u16 notify_list[NET_RX_RING_SIZE];
51838 +       int notify_nr = 0;
51839 +       int ret;
51840 +
51841 +       skb_queue_head_init(&rxq);
51842 +
51843 +       mcl = rx_mcl;
51844 +       mmu = rx_mmu;
51845 +       gop = grant_rx_op;
51846 +
51847 +       while ((skb = skb_dequeue(&rx_queue)) != NULL) {
51848 +               netif   = netdev_priv(skb->dev);
51849 +               vdata   = (unsigned long)skb->data;
51850 +               old_mfn = virt_to_mfn(vdata);
51851 +
51852 +               /* Memory squeeze? Back off for an arbitrary while. */
51853 +               if ((new_mfn = alloc_mfn()) == 0) {
51854 +                       if ( net_ratelimit() )
51855 +                               WPRINTK("Memory squeeze in netback driver.\n");
51856 +                       mod_timer(&net_timer, jiffies + HZ);
51857 +                       skb_queue_head(&rx_queue, skb);
51858 +                       break;
51859 +               }
51860 +               /*
51861 +                * Set the new P2M table entry before reassigning the old data
51862 +                * page. Heed the comment in pgtable-2level.h:pte_page(). :-)
51863 +                */
51864 +               set_phys_to_machine(__pa(skb->data) >> PAGE_SHIFT, new_mfn);
51865 +
51866 +               MULTI_update_va_mapping(mcl, vdata,
51867 +                                       pfn_pte_ma(new_mfn, PAGE_KERNEL), 0);
51868 +               mcl++;
51869 +
51870 +               gop->mfn = old_mfn;
51871 +               gop->domid = netif->domid;
51872 +               gop->ref = RING_GET_REQUEST(
51873 +                       &netif->rx, netif->rx.req_cons)->gref;
51874 +               netif->rx.req_cons++;
51875 +               gop++;
51876 +
51877 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
51878 +                       mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
51879 +                               MMU_MACHPHYS_UPDATE;
51880 +                       mmu->val = __pa(vdata) >> PAGE_SHIFT;
51881 +                       mmu++;
51882 +               }
51883 +
51884 +               __skb_queue_tail(&rxq, skb);
51885 +
51886 +               /* Filled the batch queue? */
51887 +               if ((gop - grant_rx_op) == ARRAY_SIZE(grant_rx_op))
51888 +                       break;
51889 +       }
51890 +
51891 +       if (mcl == rx_mcl)
51892 +               return;
51893 +
51894 +       mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
51895 +
51896 +       if (mmu - rx_mmu) {
51897 +               mcl->op = __HYPERVISOR_mmu_update;
51898 +               mcl->args[0] = (unsigned long)rx_mmu;
51899 +               mcl->args[1] = mmu - rx_mmu;
51900 +               mcl->args[2] = 0;
51901 +               mcl->args[3] = DOMID_SELF;
51902 +               mcl++;
51903 +       }
51904 +
51905 +       ret = HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
51906 +       BUG_ON(ret != 0);
51907 +
51908 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_transfer, grant_rx_op, 
51909 +                                       gop - grant_rx_op);
51910 +       BUG_ON(ret != 0);
51911 +
51912 +       mcl = rx_mcl;
51913 +       gop = grant_rx_op;
51914 +       while ((skb = __skb_dequeue(&rxq)) != NULL) {
51915 +               netif   = netdev_priv(skb->dev);
51916 +               size    = skb->tail - skb->data;
51917 +
51918 +               /* Rederive the machine addresses. */
51919 +               new_mfn = mcl->args[1] >> PAGE_SHIFT;
51920 +               old_mfn = gop->mfn;
51921 +               atomic_set(&(skb_shinfo(skb)->dataref), 1);
51922 +               skb_shinfo(skb)->nr_frags = 0;
51923 +               skb_shinfo(skb)->frag_list = NULL;
51924 +
51925 +               netif->stats.tx_bytes += size;
51926 +               netif->stats.tx_packets++;
51927 +
51928 +               /* The update_va_mapping() must not fail. */
51929 +               BUG_ON(mcl->result != 0);
51930 +
51931 +               /* Check the reassignment error code. */
51932 +               status = NETIF_RSP_OKAY;
51933 +               if (gop->status != 0) { 
51934 +                       DPRINTK("Bad status %d from grant transfer to DOM%u\n",
51935 +                               gop->status, netif->domid);
51936 +                       /*
51937 +                         * Page no longer belongs to us unless GNTST_bad_page,
51938 +                         * but that should be a fatal error anyway.
51939 +                         */
51940 +                       BUG_ON(gop->status == GNTST_bad_page);
51941 +                       status = NETIF_RSP_ERROR; 
51942 +               }
51943 +               irq = netif->irq;
51944 +               id = RING_GET_REQUEST(&netif->rx, netif->rx.rsp_prod_pvt)->id;
51945 +               flags = 0;
51946 +               if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
51947 +                       flags |= NETRXF_csum_blank | NETRXF_data_validated;
51948 +               else if (skb->proto_data_valid) /* remote but checksummed? */
51949 +                       flags |= NETRXF_data_validated;
51950 +               if (make_rx_response(netif, id, status,
51951 +                                    (unsigned long)skb->data & ~PAGE_MASK,
51952 +                                    size, flags) &&
51953 +                   (rx_notify[irq] == 0)) {
51954 +                       rx_notify[irq] = 1;
51955 +                       notify_list[notify_nr++] = irq;
51956 +               }
51957 +
51958 +               netif_put(netif);
51959 +               dev_kfree_skb(skb);
51960 +               mcl++;
51961 +               gop++;
51962 +       }
51963 +
51964 +       while (notify_nr != 0) {
51965 +               irq = notify_list[--notify_nr];
51966 +               rx_notify[irq] = 0;
51967 +               notify_remote_via_irq(irq);
51968 +       }
51969 +
51970 +       /* More work to do? */
51971 +       if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
51972 +               tasklet_schedule(&net_rx_tasklet);
51973 +#if 0
51974 +       else
51975 +               xen_network_done_notify();
51976 +#endif
51977 +}
51978 +
51979 +static void net_alarm(unsigned long unused)
51980 +{
51981 +       tasklet_schedule(&net_rx_tasklet);
51982 +}
51983 +
51984 +struct net_device_stats *netif_be_get_stats(struct net_device *dev)
51985 +{
51986 +       netif_t *netif = netdev_priv(dev);
51987 +       return &netif->stats;
51988 +}
51989 +
51990 +static int __on_net_schedule_list(netif_t *netif)
51991 +{
51992 +       return netif->list.next != NULL;
51993 +}
51994 +
51995 +static void remove_from_net_schedule_list(netif_t *netif)
51996 +{
51997 +       spin_lock_irq(&net_schedule_list_lock);
51998 +       if (likely(__on_net_schedule_list(netif))) {
51999 +               list_del(&netif->list);
52000 +               netif->list.next = NULL;
52001 +               netif_put(netif);
52002 +       }
52003 +       spin_unlock_irq(&net_schedule_list_lock);
52004 +}
52005 +
52006 +static void add_to_net_schedule_list_tail(netif_t *netif)
52007 +{
52008 +       if (__on_net_schedule_list(netif))
52009 +               return;
52010 +
52011 +       spin_lock_irq(&net_schedule_list_lock);
52012 +       if (!__on_net_schedule_list(netif) && netif->active) {
52013 +               list_add_tail(&netif->list, &net_schedule_list);
52014 +               netif_get(netif);
52015 +       }
52016 +       spin_unlock_irq(&net_schedule_list_lock);
52017 +}
52018 +
52019 +/*
52020 + * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
52021 + * If this driver is pipelining transmit requests then we can be very
52022 + * aggressive in avoiding new-packet notifications -- frontend only needs to
52023 + * send a notification if there are no outstanding unreceived responses.
52024 + * If we may be buffering transmit requests for any reason then we must be
52025 + * rather more conservative and treat this as the final check for pending work.
52026 + */
52027 +void netif_schedule_work(netif_t *netif)
52028 +{
52029 +       int more_to_do;
52030 +
52031 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
52032 +       more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
52033 +#else
52034 +       RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
52035 +#endif
52036 +
52037 +       if (more_to_do) {
52038 +               add_to_net_schedule_list_tail(netif);
52039 +               maybe_schedule_tx_action();
52040 +       }
52041 +}
52042 +
52043 +void netif_deschedule_work(netif_t *netif)
52044 +{
52045 +       remove_from_net_schedule_list(netif);
52046 +}
52047 +
52048 +
52049 +static void tx_credit_callback(unsigned long data)
52050 +{
52051 +       netif_t *netif = (netif_t *)data;
52052 +       netif->remaining_credit = netif->credit_bytes;
52053 +       netif_schedule_work(netif);
52054 +}
52055 +
52056 +static inline void net_tx_action_dealloc(void)
52057 +{
52058 +       gnttab_unmap_grant_ref_t *gop;
52059 +       u16 pending_idx;
52060 +       PEND_RING_IDX dc, dp;
52061 +       netif_t *netif;
52062 +       int ret;
52063 +
52064 +       dc = dealloc_cons;
52065 +       dp = dealloc_prod;
52066 +
52067 +       /*
52068 +        * Free up any grants we have finished using
52069 +        */
52070 +       gop = tx_unmap_ops;
52071 +       while (dc != dp) {
52072 +               pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
52073 +               gop->host_addr    = MMAP_VADDR(pending_idx);
52074 +               gop->dev_bus_addr = 0;
52075 +               gop->handle       = grant_tx_handle[pending_idx];
52076 +               gop++;
52077 +       }
52078 +       ret = HYPERVISOR_grant_table_op(
52079 +               GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
52080 +       BUG_ON(ret);
52081 +
52082 +       while (dealloc_cons != dp) {
52083 +               pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
52084 +
52085 +               netif = pending_tx_info[pending_idx].netif;
52086 +
52087 +               make_tx_response(netif, pending_tx_info[pending_idx].req.id, 
52088 +                                NETIF_RSP_OKAY);
52089 +        
52090 +               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
52091 +
52092 +               netif_put(netif);
52093 +       }
52094 +}
52095 +
52096 +/* Called after netfront has transmitted */
52097 +static void net_tx_action(unsigned long unused)
52098 +{
52099 +       struct list_head *ent;
52100 +       struct sk_buff *skb;
52101 +       netif_t *netif;
52102 +       netif_tx_request_t txreq;
52103 +       u16 pending_idx;
52104 +       RING_IDX i;
52105 +       gnttab_map_grant_ref_t *mop;
52106 +       unsigned int data_len;
52107 +       int ret, work_to_do;
52108 +
52109 +       if (dealloc_cons != dealloc_prod)
52110 +               net_tx_action_dealloc();
52111 +
52112 +       mop = tx_map_ops;
52113 +       while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
52114 +               !list_empty(&net_schedule_list)) {
52115 +               /* Get a netif from the list with work to do. */
52116 +               ent = net_schedule_list.next;
52117 +               netif = list_entry(ent, netif_t, list);
52118 +               netif_get(netif);
52119 +               remove_from_net_schedule_list(netif);
52120 +
52121 +               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
52122 +               if (!work_to_do) {
52123 +                       netif_put(netif);
52124 +                       continue;
52125 +               }
52126 +
52127 +               i = netif->tx.req_cons;
52128 +               rmb(); /* Ensure that we see the request before we copy it. */
52129 +               memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
52130 +               /* Credit-based scheduling. */
52131 +               if (txreq.size > netif->remaining_credit) {
52132 +                       unsigned long now = jiffies;
52133 +                       unsigned long next_credit = 
52134 +                               netif->credit_timeout.expires +
52135 +                               msecs_to_jiffies(netif->credit_usec / 1000);
52136 +
52137 +                       /* Timer could already be pending in rare cases. */
52138 +                       if (timer_pending(&netif->credit_timeout))
52139 +                               break;
52140 +
52141 +                       /* Passed the point where we can replenish credit? */
52142 +                       if (time_after_eq(now, next_credit)) {
52143 +                               netif->credit_timeout.expires = now;
52144 +                               netif->remaining_credit = netif->credit_bytes;
52145 +                       }
52146 +
52147 +                       /* Still too big to send right now? Set a callback. */
52148 +                       if (txreq.size > netif->remaining_credit) {
52149 +                               netif->remaining_credit = 0;
52150 +                               netif->credit_timeout.data     =
52151 +                                       (unsigned long)netif;
52152 +                               netif->credit_timeout.function =
52153 +                                       tx_credit_callback;
52154 +                               __mod_timer(&netif->credit_timeout,
52155 +                                           next_credit);
52156 +                               break;
52157 +                       }
52158 +               }
52159 +               netif->remaining_credit -= txreq.size;
52160 +
52161 +               netif->tx.req_cons++;
52162 +
52163 +               netif_schedule_work(netif);
52164 +
52165 +               if (unlikely(txreq.size < ETH_HLEN) || 
52166 +                   unlikely(txreq.size > ETH_FRAME_LEN)) {
52167 +                       DPRINTK("Bad packet size: %d\n", txreq.size);
52168 +                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
52169 +                       netif_put(netif);
52170 +                       continue; 
52171 +               }
52172 +
52173 +               /* The payload must not cross a page boundary, as it cannot be fragmented. */
52174 +               if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) {
52175 +                       DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
52176 +                               txreq.offset, txreq.size, 
52177 +                               (txreq.offset & ~PAGE_MASK) + txreq.size);
52178 +                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
52179 +                       netif_put(netif);
52180 +                       continue;
52181 +               }
52182 +
52183 +               pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
52184 +
52185 +               data_len = (txreq.size > PKT_PROT_LEN) ?
52186 +                       PKT_PROT_LEN : txreq.size;
52187 +
52188 +               skb = alloc_skb(data_len+16, GFP_ATOMIC);
52189 +               if (unlikely(skb == NULL)) {
52190 +                       DPRINTK("Can't allocate a skb in net_tx_action.\n");
52191 +                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
52192 +                       netif_put(netif);
52193 +                       break;
52194 +               }
52195 +
52196 +               /* Packets passed to netif_rx() must have some headroom. */
52197 +               skb_reserve(skb, 16);
52198 +
52199 +               mop->host_addr = MMAP_VADDR(pending_idx);
52200 +               mop->dom       = netif->domid;
52201 +               mop->ref       = txreq.gref;
52202 +               mop->flags     = GNTMAP_host_map | GNTMAP_readonly;
52203 +               mop++;
52204 +
52205 +               memcpy(&pending_tx_info[pending_idx].req,
52206 +                      &txreq, sizeof(txreq));
52207 +               pending_tx_info[pending_idx].netif = netif;
52208 +               *((u16 *)skb->data) = pending_idx;
52209 +
52210 +               __skb_queue_tail(&tx_queue, skb);
52211 +
52212 +               pending_cons++;
52213 +
52214 +               if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
52215 +                       break;
52216 +       }
52217 +
52218 +       if (mop == tx_map_ops)
52219 +               return;
52220 +
52221 +       ret = HYPERVISOR_grant_table_op(
52222 +               GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
52223 +       BUG_ON(ret);
52224 +
52225 +       mop = tx_map_ops;
52226 +       while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
52227 +               pending_idx = *((u16 *)skb->data);
52228 +               netif       = pending_tx_info[pending_idx].netif;
52229 +               memcpy(&txreq, &pending_tx_info[pending_idx].req,
52230 +                      sizeof(txreq));
52231 +
52232 +               /* Check the remap error code. */
52233 +               if (unlikely(mop->status)) {
52234 +                       printk(KERN_ALERT "#### netback grant fails\n");
52235 +                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
52236 +                       netif_put(netif);
52237 +                       kfree_skb(skb);
52238 +                       mop++;
52239 +                       pending_ring[MASK_PEND_IDX(pending_prod++)] =
52240 +                               pending_idx;
52241 +                       continue;
52242 +               }
52243 +               set_phys_to_machine(
52244 +                       __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
52245 +                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
52246 +               grant_tx_handle[pending_idx] = mop->handle;
52247 +
52248 +               data_len = (txreq.size > PKT_PROT_LEN) ?
52249 +                       PKT_PROT_LEN : txreq.size;
52250 +
52251 +               __skb_put(skb, data_len);
52252 +               memcpy(skb->data, 
52253 +                      (void *)(MMAP_VADDR(pending_idx)|txreq.offset),
52254 +                      data_len);
52255 +               if (data_len < txreq.size) {
52256 +                       /* Append the packet payload as a fragment. */
52257 +                       skb_shinfo(skb)->frags[0].page        = 
52258 +                               virt_to_page(MMAP_VADDR(pending_idx));
52259 +                       skb_shinfo(skb)->frags[0].size        =
52260 +                               txreq.size - data_len;
52261 +                       skb_shinfo(skb)->frags[0].page_offset = 
52262 +                               txreq.offset + data_len;
52263 +                       skb_shinfo(skb)->nr_frags = 1;
52264 +               } else {
52265 +                       /* Schedule a response immediately. */
52266 +                       netif_idx_release(pending_idx);
52267 +               }
52268 +
52269 +               skb->data_len  = txreq.size - data_len;
52270 +               skb->len      += skb->data_len;
52271 +
52272 +               skb->dev      = netif->dev;
52273 +               skb->protocol = eth_type_trans(skb, skb->dev);
52274 +
52275 +               /*
52276 +                * Old frontends do not assert data_validated, but we
52277 +                * can infer it from csum_blank, so test both flags.
52278 +                */
52279 +               if (txreq.flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
52280 +                       skb->ip_summed = CHECKSUM_UNNECESSARY;
52281 +                       skb->proto_data_valid = 1;
52282 +               } else {
52283 +                       skb->ip_summed = CHECKSUM_NONE;
52284 +                       skb->proto_data_valid = 0;
52285 +               }
52286 +               skb->proto_csum_blank = !!(txreq.flags & NETTXF_csum_blank);
52287 +
52288 +               netif->stats.rx_bytes += txreq.size;
52289 +               netif->stats.rx_packets++;
52290 +
52291 +               netif_rx(skb);
52292 +               netif->dev->last_rx = jiffies;
52293 +
52294 +               mop++;
52295 +       }
52296 +}
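+
+/*
+ * Editor's note -- illustrative summary, not part of the original patch.
+ * net_tx_action() runs in two passes: the first walks the schedule
+ * list, validates each request and queues one gnttab_map_grant_ref_t
+ * per packet, so that a single HYPERVISOR_grant_table_op() hypercall
+ * maps the whole batch; the second pass then copies the headers and
+ * hands the skbs to netif_rx().  For the credit scheduler, a vif with
+ * (hypothetical) credit_bytes = 100000 and credit_usec = 50000 may
+ * transmit at most 100000 bytes per msecs_to_jiffies(50) window,
+ * i.e. roughly 2 MB/s.
+ */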
52297 +
52298 +static void netif_idx_release(u16 pending_idx)
52299 +{
52300 +       static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
52301 +       unsigned long flags;
52302 +
52303 +       spin_lock_irqsave(&_lock, flags);
52304 +       dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
52305 +       spin_unlock_irqrestore(&_lock, flags);
52306 +
52307 +       tasklet_schedule(&net_tx_tasklet);
52308 +}
52309 +
52310 +static void netif_page_release(struct page *page)
52311 +{
52312 +       u16 pending_idx = page - virt_to_page(mmap_vstart);
52313 +
52314 +       /* Ready for next use. */
52315 +       set_page_count(page, 1);
52316 +
52317 +       netif_idx_release(pending_idx);
52318 +}
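+
+/*
+ * Editor's note, not part of the original patch: the pending index can
+ * be recovered from the page pointer alone because netback_init() below
+ * allocates MAX_PENDING_REQS virtually contiguous empty pages starting
+ * at mmap_vstart, so (page - virt_to_page(mmap_vstart)) is exactly the
+ * slot number the page was mapped into.
+ */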
52319 +
52320 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
52321 +{
52322 +       netif_t *netif = dev_id;
52323 +       add_to_net_schedule_list_tail(netif);
52324 +       maybe_schedule_tx_action();
52325 +       return IRQ_HANDLED;
52326 +}
52327 +
52328 +static void make_tx_response(netif_t *netif, 
52329 +                             u16      id,
52330 +                             s8       st)
52331 +{
52332 +       RING_IDX i = netif->tx.rsp_prod_pvt;
52333 +       netif_tx_response_t *resp;
52334 +       int notify;
52335 +
52336 +       resp = RING_GET_RESPONSE(&netif->tx, i);
52337 +       resp->id     = id;
52338 +       resp->status = st;
52339 +
52340 +       netif->tx.rsp_prod_pvt = ++i;
52341 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
52342 +       if (notify)
52343 +               notify_remote_via_irq(netif->irq);
52344 +
52345 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
52346 +       if (i == netif->tx.req_cons) {
52347 +               int more_to_do;
52348 +               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
52349 +               if (more_to_do)
52350 +                       add_to_net_schedule_list_tail(netif);
52351 +       }
52352 +#endif
52353 +}
52354 +
52355 +static int make_rx_response(netif_t *netif, 
52356 +                            u16      id, 
52357 +                            s8       st,
52358 +                            u16      offset,
52359 +                            u16      size,
52360 +                            u16      flags)
52361 +{
52362 +       RING_IDX i = netif->rx.rsp_prod_pvt;
52363 +       netif_rx_response_t *resp;
52364 +       int notify;
52365 +
52366 +       resp = RING_GET_RESPONSE(&netif->rx, i);
52367 +       resp->offset     = offset;
52368 +       resp->flags      = flags;
52369 +       resp->id         = id;
52370 +       resp->status     = (s16)size;
52371 +       if (st < 0)
52372 +               resp->status = (s16)st;
52373 +
52374 +       netif->rx.rsp_prod_pvt = ++i;
52375 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, notify);
52376 +
52377 +       return notify;
52378 +}
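+
+/*
+ * Editor's note, not part of the original patch: the two response
+ * helpers differ deliberately.  make_tx_response() pushes and, if the
+ * frontend is waiting, notifies immediately; make_rx_response() only
+ * pushes and returns the notify flag, so the receive path can post a
+ * batch of responses and issue a single notification for all of them.
+ */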
52379 +
52380 +#ifdef NETBE_DEBUG_INTERRUPT
52381 +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
52382 +{
52383 +       struct list_head *ent;
52384 +       netif_t *netif;
52385 +       int i = 0;
52386 +
52387 +       printk(KERN_ALERT "netif_schedule_list:\n");
52388 +       spin_lock_irq(&net_schedule_list_lock);
52389 +
52390 +       list_for_each (ent, &net_schedule_list) {
52391 +               netif = list_entry(ent, netif_t, list);
52392 +               printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
52393 +                      "rx_resp_prod=%08x\n",
52394 +                      i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
52395 +               printk(KERN_ALERT "   tx_req_cons=%08x tx_resp_prod=%08x)\n",
52396 +                      netif->tx.req_cons, netif->tx.rsp_prod_pvt);
52397 +               printk(KERN_ALERT "   shared(rx_req_prod=%08x "
52398 +                      "rx_resp_prod=%08x\n",
52399 +                      netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
52400 +               printk(KERN_ALERT "   rx_event=%08x tx_req_prod=%08x\n",
52401 +                      netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
52402 +               printk(KERN_ALERT "   tx_resp_prod=%08x, tx_event=%08x)\n",
52403 +                      netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
52404 +               i++;
52405 +       }
52406 +
52407 +       spin_unlock_irq(&net_schedule_list_lock);
52408 +       printk(KERN_ALERT " ** End of netif_schedule_list **\n");
52409 +
52410 +       return IRQ_HANDLED;
52411 +}
52412 +#endif
52413 +
52414 +static int __init netback_init(void)
52415 +{
52416 +       int i;
52417 +       struct page *page;
52418 +
52419 +       /* We can increase reservation by this much in net_rx_action(). */
52420 +       balloon_update_driver_allowance(NET_RX_RING_SIZE);
52421 +
52422 +       skb_queue_head_init(&rx_queue);
52423 +       skb_queue_head_init(&tx_queue);
52424 +
52425 +       init_timer(&net_timer);
52426 +       net_timer.data = 0;
52427 +       net_timer.function = net_alarm;
52428 +
52429 +       page = balloon_alloc_empty_page_range(MAX_PENDING_REQS);
52430 +       BUG_ON(page == NULL);
52431 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
52432 +
52433 +       for (i = 0; i < MAX_PENDING_REQS; i++) {
52434 +               page = virt_to_page(MMAP_VADDR(i));
52435 +               set_page_count(page, 1);
52436 +               SetPageForeign(page, netif_page_release);
52437 +       }
52438 +
52439 +       pending_cons = 0;
52440 +       pending_prod = MAX_PENDING_REQS;
52441 +       for (i = 0; i < MAX_PENDING_REQS; i++)
52442 +               pending_ring[i] = i;
52443 +
52444 +       spin_lock_init(&net_schedule_list_lock);
52445 +       INIT_LIST_HEAD(&net_schedule_list);
52446 +
52447 +       netif_xenbus_init();
52448 +
52449 +#ifdef NETBE_DEBUG_INTERRUPT
52450 +       (void)bind_virq_to_irqhandler(
52451 +               VIRQ_DEBUG,
52452 +               0,
52453 +               netif_be_dbg,
52454 +               SA_SHIRQ, 
52455 +               "net-be-dbg",
52456 +               &netif_be_dbg);
52457 +#endif
52458 +
52459 +       __unsafe(THIS_MODULE);
52460 +
52461 +       return 0;
52462 +}
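+
+/*
+ * Editor's note, not part of the original patch: marking the mapping
+ * area's pages with SetPageForeign(page, netif_page_release) is what
+ * ties skb lifetime to the grant table -- when the network stack
+ * eventually frees an skb whose fragment points at one of these pages,
+ * netif_page_release() runs, queues the slot on the dealloc ring, and
+ * the tasklet unmaps the grant and sends the NETIF_RSP_OKAY response.
+ */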
52463 +
52464 +static void netback_cleanup(void)
52465 +{
52466 +       BUG();
52467 +}
52468 +
52469 +module_init(netback_init);
52470 +module_exit(netback_cleanup);
52471 +
52472 +MODULE_LICENSE("Dual BSD/GPL");
52473 +
52474 +/*
52475 + * Local variables:
52476 + *  c-file-style: "linux"
52477 + *  indent-tabs-mode: t
52478 + *  c-indent-level: 8
52479 + *  c-basic-offset: 8
52480 + *  tab-width: 8
52481 + * End:
52482 + */
52483 diff -Nurp ref-linux-2.6.16.9/drivers/xen/netback/xenbus.c tmp-linux-2.6-xen.patch/drivers/xen/netback/xenbus.c
52484 --- ref-linux-2.6.16.9/drivers/xen/netback/xenbus.c     1970-01-01 01:00:00.000000000 +0100
52485 +++ tmp-linux-2.6-xen.patch/drivers/xen/netback/xenbus.c        2006-04-10 00:05:52.000000000 +0200
52486 @@ -0,0 +1,366 @@
52487 +/*  Xenbus code for netif backend
52488 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
52489 +    Copyright (C) 2005 XenSource Ltd
52490 +
52491 +    This program is free software; you can redistribute it and/or modify
52492 +    it under the terms of the GNU General Public License as published by
52493 +    the Free Software Foundation; either version 2 of the License, or
52494 +    (at your option) any later version.
52495 +
52496 +    This program is distributed in the hope that it will be useful,
52497 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
52498 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
52499 +    GNU General Public License for more details.
52500 +
52501 +    You should have received a copy of the GNU General Public License
52502 +    along with this program; if not, write to the Free Software
52503 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
52504 +*/
52505 +
52506 +
52507 +#include <stdarg.h>
52508 +#include <linux/module.h>
52509 +#include <xen/xenbus.h>
52510 +#include <xen/net_driver_util.h>
52511 +#include "common.h"
52512 +
52513 +
52514 +#if 0
52515 +#undef DPRINTK
52516 +#define DPRINTK(fmt, args...) \
52517 +    printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
52518 +#endif
52519 +
52520 +
52521 +struct backend_info
52522 +{
52523 +       struct xenbus_device *dev;
52524 +       netif_t *netif;
52525 +       struct xenbus_watch backend_watch;
52526 +       XenbusState frontend_state;
52527 +};
52528 +
52529 +
52530 +static int connect_rings(struct backend_info *);
52531 +static void connect(struct backend_info *);
52532 +static void maybe_connect(struct backend_info *);
52533 +static void backend_changed(struct xenbus_watch *, const char **,
52534 +                           unsigned int);
52535 +
52536 +
52537 +static int netback_remove(struct xenbus_device *dev)
52538 +{
52539 +       struct backend_info *be = dev->data;
52540 +
52541 +       if (be->backend_watch.node) {
52542 +               unregister_xenbus_watch(&be->backend_watch);
52543 +               kfree(be->backend_watch.node);
52544 +               be->backend_watch.node = NULL;
52545 +       }
52546 +       if (be->netif) {
52547 +               netif_disconnect(be->netif);
52548 +               be->netif = NULL;
52549 +       }
52550 +       kfree(be);
52551 +       dev->data = NULL;
52552 +       return 0;
52553 +}
52554 +
52555 +
52556 +/**
52557 + * Entry point to this code when a new device is created.  Allocate the basic
52558 + * structures, and watch the store waiting for the hotplug scripts to tell us
52559 + * the device's handle.  Switch to InitWait.
52560 + */
52561 +static int netback_probe(struct xenbus_device *dev,
52562 +                        const struct xenbus_device_id *id)
52563 +{
52564 +       int err;
52565 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
52566 +                                         GFP_KERNEL);
52567 +       if (!be) {
52568 +               xenbus_dev_fatal(dev, -ENOMEM,
52569 +                                "allocating backend structure");
52570 +               return -ENOMEM;
52571 +       }
52572 +
52573 +       be->dev = dev;
52574 +       dev->data = be;
52575 +
52576 +       err = xenbus_watch_path2(dev, dev->nodename, "handle",
52577 +                                &be->backend_watch, backend_changed);
52578 +       if (err)
52579 +               goto fail;
52580 +
52581 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
52582 +       if (err) {
52583 +               goto fail;
52584 +       }
52585 +
52586 +       return 0;
52587 +
52588 +fail:
52589 +       DPRINTK("failed");
52590 +       netback_remove(dev);
52591 +       return err;
52592 +}
52593 +
52594 +
52595 +/**
52596 + * Handle the creation of the hotplug script environment.  We add the script
52597 + * and vif variables to the environment, for the benefit of the vif-* hotplug
52598 + * scripts.
52599 + */
52600 +static int netback_uevent(struct xenbus_device *xdev, char **envp,
52601 +                         int num_envp, char *buffer, int buffer_size)
52602 +{
52603 +       struct backend_info *be = xdev->data;
52604 +       netif_t *netif = be->netif;
52605 +       int i = 0, length = 0;
52606 +       char *val;
52607 +
52608 +       DPRINTK("netback_uevent");
52609 +
52610 +       val = xenbus_read(XBT_NULL, xdev->nodename, "script", NULL);
52611 +       if (IS_ERR(val)) {
52612 +               int err = PTR_ERR(val);
52613 +               xenbus_dev_fatal(xdev, err, "reading script");
52614 +               return err;
52615 +       }
52616 +       else {
52617 +               add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
52618 +                              &length, "script=%s", val);
52619 +               kfree(val);
52620 +       }
52621 +
52622 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
52623 +                      "vif=%s", netif->dev->name);
52624 +
52625 +       envp[i] = NULL;
52626 +
52627 +       return 0;
52628 +}
52629 +
52630 +
52631 +/**
52632 + * Callback received when the hotplug scripts have placed the handle node.
52633 + * Read it, and create a netif structure.  If the frontend is ready, connect.
52634 + */
52635 +static void backend_changed(struct xenbus_watch *watch,
52636 +                           const char **vec, unsigned int len)
52637 +{
52638 +       int err;
52639 +       long handle;
52640 +       struct backend_info *be
52641 +               = container_of(watch, struct backend_info, backend_watch);
52642 +       struct xenbus_device *dev = be->dev;
52643 +
52644 +       DPRINTK("");
52645 +
52646 +       err = xenbus_scanf(XBT_NULL, dev->nodename, "handle", "%li", &handle);
52647 +       if (XENBUS_EXIST_ERR(err)) {
52648 +               /* Since this watch will fire once immediately after it is
52649 +                  registered, we expect this.  Ignore it, and wait for the
52650 +                  hotplug scripts. */
52651 +               return;
52652 +       }
52653 +       if (err != 1) {
52654 +               xenbus_dev_fatal(dev, err, "reading handle");
52655 +               return;
52656 +       }
52657 +
52658 +       if (be->netif == NULL) {
52659 +               u8 be_mac[ETH_ALEN] = { 0, 0, 0, 0, 0, 0 };
52660 +
52661 +               be->netif = alloc_netif(dev->otherend_id, handle, be_mac);
52662 +               if (IS_ERR(be->netif)) {
52663 +                       err = PTR_ERR(be->netif);
52664 +                       be->netif = NULL;
52665 +                       xenbus_dev_fatal(dev, err, "creating interface");
52666 +                       return;
52667 +               }
52668 +
52669 +               kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
52670 +
52671 +               maybe_connect(be);
52672 +       }
52673 +}
52674 +
52675 +
52676 +/**
52677 + * Callback received when the frontend's state changes.
52678 + */
52679 +static void frontend_changed(struct xenbus_device *dev,
52680 +                            XenbusState frontend_state)
52681 +{
52682 +       struct backend_info *be = dev->data;
52683 +
52684 +       DPRINTK("");
52685 +
52686 +       be->frontend_state = frontend_state;
52687 +
52688 +       switch (frontend_state) {
52689 +       case XenbusStateInitialising:
52690 +       case XenbusStateInitialised:
52691 +               break;
52692 +
52693 +       case XenbusStateConnected:
52694 +               maybe_connect(be);
52695 +               break;
52696 +
52697 +       case XenbusStateClosing:
52698 +               xenbus_switch_state(dev, XenbusStateClosing);
52699 +               break;
52700 +
52701 +       case XenbusStateClosed:
52702 +               if (be->netif != NULL)
52703 +                       kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
52704 +               device_unregister(&dev->dev);
52705 +               break;
52706 +
52707 +       case XenbusStateUnknown:
52708 +       case XenbusStateInitWait:
52709 +       default:
52710 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
52711 +                                frontend_state);
52712 +               break;
52713 +       }
52714 +}
52715 +
52716 +
52717 +/* ** Connection ** */
52718 +
52719 +
52720 +static void maybe_connect(struct backend_info *be)
52721 +{
52722 +       if (be->netif && (be->frontend_state == XenbusStateConnected))
52723 +               connect(be);
52724 +}
52725 +
52726 +static void xen_net_read_rate(struct xenbus_device *dev,
52727 +                             unsigned long *bytes, unsigned long *usec)
52728 +{
52729 +       char *s, *e;
52730 +       unsigned long b, u;
52731 +       char *ratestr;
52732 +
52733 +       /* Default to unlimited bandwidth. */
52734 +       *bytes = ~0UL;
52735 +       *usec = 0;
52736 +
52737 +       ratestr = xenbus_read(XBT_NULL, dev->nodename, "rate", NULL);
52738 +       if (IS_ERR(ratestr))
52739 +               return;
52740 +
52741 +       s = ratestr;
52742 +       b = simple_strtoul(s, &e, 10);
52743 +       if ((s == e) || (*e != ','))
52744 +               goto fail;
52745 +
52746 +       s = e + 1;
52747 +       u = simple_strtoul(s, &e, 10);
52748 +       if ((s == e) || (*e != '\0'))
52749 +               goto fail;
52750 +
52751 +       *bytes = b;
52752 +       *usec = u;
52753 +
52754 +       kfree(ratestr);
52755 +       return;
52756 +
52757 + fail:
52758 +       WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
52759 +       kfree(ratestr);
52760 +}
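+
+/*
+ * Editor's note -- illustrative example, not part of the original
+ * patch.  The xenstore "rate" node is parsed as "<bytes>,<usec>": a
+ * (hypothetical) node containing "100000,50000" limits the vif to
+ * 100000 bytes per 50000 microseconds.  Anything that fails to parse
+ * falls back to unlimited traffic, as the fail path above shows.
+ */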
52761 +
52762 +
52763 +static void connect(struct backend_info *be)
52764 +{
52765 +       int err;
52766 +       struct xenbus_device *dev = be->dev;
52767 +
52768 +       err = connect_rings(be);
52769 +       if (err)
52770 +               return;
52771 +
52772 +       err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
52773 +       if (err) {
52774 +               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
52775 +               return;
52776 +       }
52777 +
52778 +       xen_net_read_rate(dev, &be->netif->credit_bytes,
52779 +                         &be->netif->credit_usec);
52780 +       be->netif->remaining_credit = be->netif->credit_bytes;
52781 +
52782 +       xenbus_switch_state(dev, XenbusStateConnected);
52783 +}
52784 +
52785 +
52786 +static int connect_rings(struct backend_info *be)
52787 +{
52788 +       struct xenbus_device *dev = be->dev;
52789 +       unsigned long tx_ring_ref, rx_ring_ref;
52790 +       unsigned int evtchn;
52791 +       int err;
52792 +
52793 +       DPRINTK("");
52794 +
52795 +       err = xenbus_gather(XBT_NULL, dev->otherend,
52796 +                           "tx-ring-ref", "%lu", &tx_ring_ref,
52797 +                           "rx-ring-ref", "%lu", &rx_ring_ref,
52798 +                           "event-channel", "%u", &evtchn, NULL);
52799 +       if (err) {
52800 +               xenbus_dev_fatal(dev, err,
52801 +                                "reading %s/ring-ref and event-channel",
52802 +                                dev->otherend);
52803 +               return err;
52804 +       }
52805 +
52806 +       /* Map the shared frame, irq etc. */
52807 +       err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
52808 +       if (err) {
52809 +               xenbus_dev_fatal(dev, err,
52810 +                                "mapping shared-frames %lu/%lu port %u",
52811 +                                tx_ring_ref, rx_ring_ref, evtchn);
52812 +               return err;
52813 +       }
52814 +       return 0;
52815 +}
52816 +
52817 +
52818 +/* ** Driver Registration ** */
52819 +
52820 +
52821 +static struct xenbus_device_id netback_ids[] = {
52822 +       { "vif" },
52823 +       { "" }
52824 +};
52825 +
52826 +
52827 +static struct xenbus_driver netback = {
52828 +       .name = "vif",
52829 +       .owner = THIS_MODULE,
52830 +       .ids = netback_ids,
52831 +       .probe = netback_probe,
52832 +       .remove = netback_remove,
52833 +       .uevent = netback_uevent,
52834 +       .otherend_changed = frontend_changed,
52835 +};
52836 +
52837 +
52838 +void netif_xenbus_init(void)
52839 +{
52840 +       xenbus_register_backend(&netback);
52841 +}
52842 +
52843 +
52844 +/*
52845 + * Local variables:
52846 + *  c-file-style: "linux"
52847 + *  indent-tabs-mode: t
52848 + *  c-indent-level: 8
52849 + *  c-basic-offset: 8
52850 + *  tab-width: 8
52851 + * End:
52852 + */
52853 diff -Nurp ref-linux-2.6.16.9/drivers/xen/net_driver_util.c tmp-linux-2.6-xen.patch/drivers/xen/net_driver_util.c
52854 --- ref-linux-2.6.16.9/drivers/xen/net_driver_util.c    1970-01-01 01:00:00.000000000 +0100
52855 +++ tmp-linux-2.6-xen.patch/drivers/xen/net_driver_util.c       2006-04-10 00:05:52.000000000 +0200
52856 @@ -0,0 +1,68 @@
52857 +/*****************************************************************************
52858 + *
52859 + * Utility functions for Xen network devices.
52860 + *
52861 + * Copyright (c) 2005 XenSource Ltd.
52862 + * 
52863 + * This program is free software; you can redistribute it and/or
52864 + * modify it under the terms of the GNU General Public License version 2
52865 + * as published by the Free Software Foundation; or, when distributed
52866 + * separately from the Linux kernel or incorporated into other
52867 + * software packages, subject to the following license:
52868 + * 
52869 + * Permission is hereby granted, free of charge, to any person obtaining a
52870 + * copy of this source file (the "Software"), to deal in the Software without
52871 + * restriction, including without limitation the rights to use, copy, modify,
52872 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
52873 + * and to permit persons to whom the Software is furnished to do so, subject
52874 + * to the following conditions:
52875 + * 
52876 + * The above copyright notice and this permission notice shall be included in
52877 + * all copies or substantial portions of the Software.
52878 + * 
52879 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
52880 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
52881 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52882 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52883 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
52884 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
52885 + * DEALINGS IN THE SOFTWARE.
52886 + */
52887 +
52888 +#include <linux/if_ether.h>
52889 +#include <linux/err.h>
52890 +#include <linux/module.h>
52891 +#include <xen/net_driver_util.h>
52892 +
52893 +
52894 +int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
52895 +{
52896 +       char *s;
52897 +       int i;
52898 +       char *e;
52899 +       char *macstr = xenbus_read(XBT_NULL, dev->nodename, "mac", NULL);
52900 +       if (IS_ERR(macstr))
52901 +               return PTR_ERR(macstr);
52902 +       s = macstr;
52903 +       for (i = 0; i < ETH_ALEN; i++) {
52904 +               mac[i] = simple_strtoul(s, &e, 16);
52905 +               if (s == e || (e[0] != ':' && e[0] != 0)) {
52906 +                       kfree(macstr);
52907 +                       return -ENOENT;
52908 +               }
52909 +               s = &e[1];
52910 +       }
52911 +       kfree(macstr);
52912 +       return 0;
52913 +}
52914 +EXPORT_SYMBOL_GPL(xen_net_read_mac);
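+
+/*
+ * Editor's note -- illustrative example, not part of the original
+ * patch.  The "mac" node is expected to hold six colon-separated hex
+ * octets, e.g. "00:16:3e:00:00:01" (00:16:3e being the OUI assigned to
+ * Xen); the loop above rejects a malformed separator or missing octet
+ * with -ENOENT.
+ */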
52915 +
52916 +/*
52917 + * Local variables:
52918 + *  c-file-style: "linux"
52919 + *  indent-tabs-mode: t
52920 + *  c-indent-level: 8
52921 + *  c-basic-offset: 8
52922 + *  tab-width: 8
52923 + * End:
52924 + */
52925 diff -Nurp ref-linux-2.6.16.9/drivers/xen/netfront/Makefile tmp-linux-2.6-xen.patch/drivers/xen/netfront/Makefile
52926 --- ref-linux-2.6.16.9/drivers/xen/netfront/Makefile    1970-01-01 01:00:00.000000000 +0100
52927 +++ tmp-linux-2.6-xen.patch/drivers/xen/netfront/Makefile       2006-04-10 00:05:52.000000000 +0200
52928 @@ -0,0 +1,4 @@
52929 +
52930 +obj-$(CONFIG_XEN_NETDEV_FRONTEND)      := xennet.o
52931 +
52932 +xennet-objs := netfront.o
52933 diff -Nurp ref-linux-2.6.16.9/drivers/xen/netfront/netfront.c tmp-linux-2.6-xen.patch/drivers/xen/netfront/netfront.c
52934 --- ref-linux-2.6.16.9/drivers/xen/netfront/netfront.c  1970-01-01 01:00:00.000000000 +0100
52935 +++ tmp-linux-2.6-xen.patch/drivers/xen/netfront/netfront.c     2006-04-10 00:05:52.000000000 +0200
52936 @@ -0,0 +1,1524 @@
52937 +/******************************************************************************
52938 + * Virtual network driver for conversing with remote driver backends.
52939 + * 
52940 + * Copyright (c) 2002-2005, K A Fraser
52941 + * Copyright (c) 2005, XenSource Ltd
52942 + * 
52943 + * This program is free software; you can redistribute it and/or
52944 + * modify it under the terms of the GNU General Public License version 2
52945 + * as published by the Free Software Foundation; or, when distributed
52946 + * separately from the Linux kernel or incorporated into other
52947 + * software packages, subject to the following license:
52948 + * 
52949 + * Permission is hereby granted, free of charge, to any person obtaining a copy
52950 + * of this source file (the "Software"), to deal in the Software without
52951 + * restriction, including without limitation the rights to use, copy, modify,
52952 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
52953 + * and to permit persons to whom the Software is furnished to do so, subject to
52954 + * the following conditions:
52955 + * 
52956 + * The above copyright notice and this permission notice shall be included in
52957 + * all copies or substantial portions of the Software.
52958 + * 
52959 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
52960 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
52961 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52962 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52963 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
52964 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
52965 + * IN THE SOFTWARE.
52966 + */
52967 +
52968 +#include <linux/config.h>
52969 +#include <linux/module.h>
52970 +#include <linux/version.h>
52971 +#include <linux/kernel.h>
52972 +#include <linux/sched.h>
52973 +#include <linux/slab.h>
52974 +#include <linux/string.h>
52975 +#include <linux/errno.h>
52976 +#include <linux/netdevice.h>
52977 +#include <linux/inetdevice.h>
52978 +#include <linux/etherdevice.h>
52979 +#include <linux/skbuff.h>
52980 +#include <linux/init.h>
52981 +#include <linux/bitops.h>
52982 +#include <linux/proc_fs.h>
52983 +#include <linux/ethtool.h>
52984 +#include <linux/in.h>
52985 +#include <net/sock.h>
52986 +#include <net/pkt_sched.h>
52987 +#include <net/arp.h>
52988 +#include <net/route.h>
52989 +#include <asm/io.h>
52990 +#include <asm/uaccess.h>
52991 +#include <xen/evtchn.h>
52992 +#include <xen/xenbus.h>
52993 +#include <xen/interface/io/netif.h>
52994 +#include <xen/interface/memory.h>
52995 +#include <xen/balloon.h>
52996 +#include <asm/page.h>
52997 +#include <asm/uaccess.h>
52998 +#include <xen/interface/grant_table.h>
52999 +#include <xen/gnttab.h>
53000 +#include <xen/net_driver_util.h>
53001 +
53002 +#define GRANT_INVALID_REF      0
53003 +
53004 +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
53005 +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
53006 +
53007 +static inline void init_skb_shinfo(struct sk_buff *skb)
53008 +{
53009 +       atomic_set(&(skb_shinfo(skb)->dataref), 1);
53010 +       skb_shinfo(skb)->nr_frags = 0;
53011 +       skb_shinfo(skb)->frag_list = NULL;
53012 +}
53013 +
53014 +struct netfront_info
53015 +{
53016 +       struct list_head list;
53017 +       struct net_device *netdev;
53018 +
53019 +       struct net_device_stats stats;
53020 +       unsigned int tx_full;
53021 +
53022 +       netif_tx_front_ring_t tx;
53023 +       netif_rx_front_ring_t rx;
53024 +
53025 +       spinlock_t   tx_lock;
53026 +       spinlock_t   rx_lock;
53027 +
53028 +       unsigned int handle;
53029 +       unsigned int evtchn, irq;
53030 +
53031 +       /* What is the status of our connection to the remote backend? */
53032 +#define BEST_CLOSED       0
53033 +#define BEST_DISCONNECTED 1
53034 +#define BEST_CONNECTED    2
53035 +       unsigned int backend_state;
53036 +
53037 +       /* Is this interface open or closed (down or up)? */
53038 +#define UST_CLOSED        0
53039 +#define UST_OPEN          1
53040 +       unsigned int user_state;
53041 +
53042 +       /* Receive-ring batched refills. */
53043 +#define RX_MIN_TARGET 8
53044 +#define RX_DFL_MIN_TARGET 64
53045 +#define RX_MAX_TARGET NET_RX_RING_SIZE
53046 +       int rx_min_target, rx_max_target, rx_target;
53047 +       struct sk_buff_head rx_batch;
53048 +
53049 +       struct timer_list rx_refill_timer;
53050 +
53051 +       /*
53052 +        * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
53053 +        * array is an index into a chain of free entries.
53054 +        */
53055 +       struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
53056 +       struct sk_buff *rx_skbs[NET_RX_RING_SIZE+1];
53057 +
53058 +       grant_ref_t gref_tx_head;
53059 +       grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
53060 +       grant_ref_t gref_rx_head;
53061 +       grant_ref_t grant_rx_ref[NET_RX_RING_SIZE + 1];
53062 +
53063 +       struct xenbus_device *xbdev;
53064 +       int tx_ring_ref;
53065 +       int rx_ring_ref;
53066 +       u8 mac[ETH_ALEN];
53067 +
53068 +       unsigned long rx_pfn_array[NET_RX_RING_SIZE];
53069 +       multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
53070 +       mmu_update_t rx_mmu[NET_RX_RING_SIZE];
53071 +};
53072 +
53073 +/*
53074 + * Access functions for acquiring and freeing slots in {tx,rx}_skbs[].
53075 + */
53076 +
53077 +static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
53078 +{
53079 +       list[id] = list[0];
53080 +       list[0]  = (void *)(unsigned long)id;
53081 +}
53082 +
53083 +static inline unsigned short get_id_from_freelist(struct sk_buff **list)
53084 +{
53085 +       unsigned int id = (unsigned int)(unsigned long)list[0];
53086 +       list[0] = list[id];
53087 +       return id;
53088 +}
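+
+/*
+ * Editor's note -- illustrative sketch, not part of the original patch.
+ * The free list is threaded through the pointer array itself: entry 0
+ * holds the id of the first free slot (cast to a pointer), and each
+ * free slot holds the id of the next.  A hypothetical three-slot walk:
+ */
+#if 0
+static struct sk_buff *demo_list[4];
+
+static void demo_freelist(void)
+{
+	demo_list[0] = (void *)1UL;     /* head of the free chain: slot 1 */
+	demo_list[1] = (void *)2UL;     /* slot 1 -> slot 2               */
+	demo_list[2] = (void *)3UL;     /* slot 2 -> slot 3 (last)        */
+
+	/* get_id_from_freelist(demo_list) returns 1, head becomes 2;    */
+	/* add_id_to_freelist(demo_list, 1) pushes slot 1 back in front. */
+}
+#endif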
53089 +
53090 +#ifdef DEBUG
53091 +static char *be_state_name[] = {
53092 +       [BEST_CLOSED]       = "closed",
53093 +       [BEST_DISCONNECTED] = "disconnected",
53094 +       [BEST_CONNECTED]    = "connected",
53095 +};
53096 +#endif
53097 +
53098 +#define DPRINTK(fmt, args...) pr_debug("netfront (%s:%d) " fmt, \
53099 +                                       __FUNCTION__, __LINE__, ##args)
53100 +#define IPRINTK(fmt, args...)                          \
53101 +       printk(KERN_INFO "netfront: " fmt, ##args)
53102 +#define WPRINTK(fmt, args...)                          \
53103 +       printk(KERN_WARNING "netfront: " fmt, ##args)
53104 +
53105 +
53106 +static int talk_to_backend(struct xenbus_device *, struct netfront_info *);
53107 +static int setup_device(struct xenbus_device *, struct netfront_info *);
53108 +static int create_netdev(int, struct xenbus_device *, struct net_device **);
53109 +
53110 +static void netfront_closing(struct xenbus_device *);
53111 +
53112 +static void end_access(int, void *);
53113 +static void netif_disconnect_backend(struct netfront_info *);
53114 +static void close_netdev(struct netfront_info *);
53115 +static void netif_free(struct netfront_info *);
53116 +
53117 +static void show_device(struct netfront_info *);
53118 +
53119 +static void network_connect(struct net_device *);
53120 +static void network_tx_buf_gc(struct net_device *);
53121 +static void network_alloc_rx_buffers(struct net_device *);
53122 +static int send_fake_arp(struct net_device *);
53123 +
53124 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
53125 +
53126 +#ifdef CONFIG_PROC_FS
53127 +static int xennet_proc_init(void);
53128 +static int xennet_proc_addif(struct net_device *dev);
53129 +static void xennet_proc_delif(struct net_device *dev);
53130 +#else
53131 +#define xennet_proc_init()   (0)
53132 +#define xennet_proc_addif(d) (0)
53133 +#define xennet_proc_delif(d) ((void)0)
53134 +#endif
53135 +
53136 +
53137 +/**
53138 + * Entry point to this code when a new device is created.  Allocate the basic
53139 + * structures and the ring buffers for communication with the backend, and
53140 + * inform the backend of the appropriate details for those.  Switch to
53141 + * Connected state.
53142 + */
53143 +static int netfront_probe(struct xenbus_device *dev,
53144 +                         const struct xenbus_device_id *id)
53145 +{
53146 +       int err;
53147 +       struct net_device *netdev;
53148 +       struct netfront_info *info;
53149 +       unsigned int handle;
53150 +
53151 +       err = xenbus_scanf(XBT_NULL, dev->nodename, "handle", "%u", &handle);
53152 +       if (err != 1) {
53153 +               xenbus_dev_fatal(dev, err, "reading handle");
53154 +               return err;
53155 +       }
53156 +
53157 +       err = create_netdev(handle, dev, &netdev);
53158 +       if (err) {
53159 +               xenbus_dev_fatal(dev, err, "creating netdev");
53160 +               return err;
53161 +       }
53162 +
53163 +       info = netdev_priv(netdev);
53164 +       dev->data = info;
53165 +
53166 +       err = talk_to_backend(dev, info);
53167 +       if (err) {
53168 +               kfree(info);
53169 +               dev->data = NULL;
53170 +               return err;
53171 +       }
53172 +
53173 +       return 0;
53174 +}
53175 +
53176 +
53177 +/**
53178 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
53179 + * driver restart.  We tear down our netif structure and recreate it, but
53180 + * leave the device-layer structures intact so that this is transparent to the
53181 + * rest of the kernel.
53182 + */
53183 +static int netfront_resume(struct xenbus_device *dev)
53184 +{
53185 +       struct netfront_info *info = dev->data;
53186 +
53187 +       DPRINTK("%s\n", dev->nodename);
53188 +
53189 +       netif_disconnect_backend(info);
53190 +       return talk_to_backend(dev, info);
53191 +}
53192 +
53193 +
53194 +/* Common code used when first setting up, and when resuming. */
53195 +static int talk_to_backend(struct xenbus_device *dev,
53196 +                          struct netfront_info *info)
53197 +{
53198 +       const char *message;
53199 +       xenbus_transaction_t xbt;
53200 +       int err;
53201 +
53202 +       err = xen_net_read_mac(dev, info->mac);
53203 +       if (err) {
53204 +               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
53205 +               goto out;
53206 +       }
53207 +
53208 +       /* Create shared ring, alloc event channel. */
53209 +       err = setup_device(dev, info);
53210 +       if (err)
53211 +               goto out;
53212 +
53213 +again:
53214 +       err = xenbus_transaction_start(&xbt);
53215 +       if (err) {
53216 +               xenbus_dev_fatal(dev, err, "starting transaction");
53217 +               goto destroy_ring;
53218 +       }
53219 +
53220 +       err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
53221 +                           info->tx_ring_ref);
53222 +       if (err) {
53223 +               message = "writing tx ring-ref";
53224 +               goto abort_transaction;
53225 +       }
53226 +       err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
53227 +                           info->rx_ring_ref);
53228 +       if (err) {
53229 +               message = "writing rx ring-ref";
53230 +               goto abort_transaction;
53231 +       }
53232 +       err = xenbus_printf(xbt, dev->nodename,
53233 +                           "event-channel", "%u", info->evtchn);
53234 +       if (err) {
53235 +               message = "writing event-channel";
53236 +               goto abort_transaction;
53237 +       }
53238 +
53239 +       err = xenbus_printf(xbt, dev->nodename,
53240 +                           "state", "%d", XenbusStateConnected);
53241 +       if (err) {
53242 +               message = "writing frontend XenbusStateConnected";
53243 +               goto abort_transaction;
53244 +       }
53245 +
53246 +       err = xenbus_transaction_end(xbt, 0);
53247 +       if (err) {
53248 +               if (err == -EAGAIN)
53249 +                       goto again;
53250 +               xenbus_dev_fatal(dev, err, "completing transaction");
53251 +               goto destroy_ring;
53252 +       }
53253 +
53254 +       return 0;
53255 +
53256 + abort_transaction:
53257 +       xenbus_transaction_end(xbt, 1);
53258 +       xenbus_dev_fatal(dev, err, "%s", message);
53259 + destroy_ring:
53260 +       netif_free(info);
53261 + out:
53262 +       return err;
53263 +}
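+
+/*
+ * Editor's note, not part of the original patch: the keys written in
+ * the transaction above -- "tx-ring-ref", "rx-ring-ref" and
+ * "event-channel" under the frontend's nodename -- are exactly the
+ * triple that the backend's connect_rings() gathers from its
+ * dev->otherend, so this function and the backend's xenbus code form
+ * the two halves of one handshake.
+ */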
53264 +
53265 +
53266 +static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
53267 +{
53268 +       netif_tx_sring_t *txs;
53269 +       netif_rx_sring_t *rxs;
53270 +       int err;
53271 +       struct net_device *netdev = info->netdev;
53272 +
53273 +       info->tx_ring_ref = GRANT_INVALID_REF;
53274 +       info->rx_ring_ref = GRANT_INVALID_REF;
53275 +       info->rx.sring = NULL;
53276 +       info->tx.sring = NULL;
53277 +       info->irq = 0;
53278 +
53279 +       txs = (netif_tx_sring_t *)__get_free_page(GFP_KERNEL);
53280 +       if (!txs) {
53281 +               err = -ENOMEM;
53282 +               xenbus_dev_fatal(dev, err, "allocating tx ring page");
53283 +               goto fail;
53284 +       }
53285 +       rxs = (netif_rx_sring_t *)__get_free_page(GFP_KERNEL);
53286 +       if (!rxs) {
53287 +               err = -ENOMEM;
53288 +               xenbus_dev_fatal(dev, err, "allocating rx ring page");
53289 +               goto fail;
53290 +       }
53291 +       memset(txs, 0, PAGE_SIZE);
53292 +       memset(rxs, 0, PAGE_SIZE);
53293 +       info->backend_state = BEST_DISCONNECTED;
53294 +
53295 +       SHARED_RING_INIT(txs);
53296 +       FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
53297 +
53298 +       SHARED_RING_INIT(rxs);
53299 +       FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
53300 +
53301 +       err = xenbus_grant_ring(dev, virt_to_mfn(txs));
53302 +       if (err < 0)
53303 +               goto fail;
53304 +       info->tx_ring_ref = err;
53305 +
53306 +       err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
53307 +       if (err < 0)
53308 +               goto fail;
53309 +       info->rx_ring_ref = err;
53310 +
53311 +       err = xenbus_alloc_evtchn(dev, &info->evtchn);
53312 +       if (err)
53313 +               goto fail;
53314 +
53315 +       memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
53316 +       network_connect(netdev);
53317 +       info->irq = bind_evtchn_to_irqhandler(
53318 +               info->evtchn, netif_int, SA_SAMPLE_RANDOM, netdev->name,
53319 +               netdev);
53320 +       (void)send_fake_arp(netdev);
53321 +       show_device(info);
53322 +
53323 +       return 0;
53324 +
53325 + fail:
53326 +       netif_free(info);
53327 +       return err;
53328 +}
53329 +
53330 +
53331 +/**
53332 + * Callback received when the backend's state changes.
53333 + */
53334 +static void backend_changed(struct xenbus_device *dev,
53335 +                           XenbusState backend_state)
53336 +{
53337 +       DPRINTK("\n");
53338 +
53339 +       switch (backend_state) {
53340 +       case XenbusStateInitialising:
53341 +       case XenbusStateInitWait:
53342 +       case XenbusStateInitialised:
53343 +       case XenbusStateConnected:
53344 +       case XenbusStateUnknown:
53345 +       case XenbusStateClosed:
53346 +               break;
53347 +
53348 +       case XenbusStateClosing:
53349 +               netfront_closing(dev);
53350 +               break;
53351 +       }
53352 +}
53353 +
53354 +
53355 +/** Send a packet on a net device to encourage switches to learn the
53356 + * MAC. We send a fake ARP request.
53357 + *
53358 + * @param dev device
53359 + * @return 0 on success, error code otherwise
53360 + */
53361 +static int send_fake_arp(struct net_device *dev)
53362 +{
53363 +       struct sk_buff *skb;
53364 +       u32             src_ip, dst_ip;
53365 +
53366 +       dst_ip = INADDR_BROADCAST;
53367 +       src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
53368 +
53369 +       /* No IP? Then nothing to do. */
53370 +       if (src_ip == 0)
53371 +               return 0;
53372 +
53373 +       skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
53374 +                        dst_ip, dev, src_ip,
53375 +                        /*dst_hw*/ NULL, /*src_hw*/ NULL,
53376 +                        /*target_hw*/ dev->dev_addr);
53377 +       if (skb == NULL)
53378 +               return -ENOMEM;
53379 +
53380 +       return dev_queue_xmit(skb);
53381 +}
53382 +
53383 +
53384 +static int network_open(struct net_device *dev)
53385 +{
53386 +       struct netfront_info *np = netdev_priv(dev);
53387 +
53388 +       memset(&np->stats, 0, sizeof(np->stats));
53389 +
53390 +       np->user_state = UST_OPEN;
53391 +
53392 +       network_alloc_rx_buffers(dev);
53393 +       np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
53394 +
53395 +       netif_start_queue(dev);
53396 +
53397 +       return 0;
53398 +}
53399 +
53400 +static void network_tx_buf_gc(struct net_device *dev)
53401 +{
53402 +       RING_IDX i, prod;
53403 +       unsigned short id;
53404 +       struct netfront_info *np = netdev_priv(dev);
53405 +       struct sk_buff *skb;
53406 +
53407 +       if (np->backend_state != BEST_CONNECTED)
53408 +               return;
53409 +
53410 +       do {
53411 +               prod = np->tx.sring->rsp_prod;
53412 +               rmb(); /* Ensure we see responses up to 'rp'. */
53413 +
53414 +               for (i = np->tx.rsp_cons; i != prod; i++) {
53415 +                       id  = RING_GET_RESPONSE(&np->tx, i)->id;
53416 +                       skb = np->tx_skbs[id];
53417 +                       if (unlikely(gnttab_query_foreign_access(
53418 +                               np->grant_tx_ref[id]) != 0)) {
53419 +                               printk(KERN_ALERT "network_tx_buf_gc: warning "
53420 +                                      "-- grant still in use by backend "
53421 +                                      "domain.\n");
53422 +                               goto out;
53423 +                       }
53424 +                       gnttab_end_foreign_access_ref(
53425 +                               np->grant_tx_ref[id], GNTMAP_readonly);
53426 +                       gnttab_release_grant_reference(
53427 +                               &np->gref_tx_head, np->grant_tx_ref[id]);
53428 +                       np->grant_tx_ref[id] = GRANT_INVALID_REF;
53429 +                       add_id_to_freelist(np->tx_skbs, id);
53430 +                       dev_kfree_skb_irq(skb);
53431 +               }
53432 +
53433 +               np->tx.rsp_cons = prod;
53434 +
53435 +               /*
53436 +                * Set a new event, then check for race with update of tx_cons.
53437 +                * Note that it is essential to schedule a callback, no matter
53438 +                * how few buffers are pending. Even if there is space in the
53439 +                * transmit ring, higher layers may be blocked because too much
53440 +                * data is outstanding: in such cases notification from Xen is
53441 +                * likely to be the only kick that we'll get.
53442 +                */
53443 +               np->tx.sring->rsp_event =
53444 +                       prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
53445 +               mb();
53446 +       } while (prod != np->tx.sring->rsp_prod);
53447 +
53448 + out:
53449 +       if (np->tx_full &&
53450 +           ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) {
53451 +               np->tx_full = 0;
53452 +               if (np->user_state == UST_OPEN)
53453 +                       netif_wake_queue(dev);
53454 +       }
53455 +}
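+
+/*
+ * Editor's note -- worked example, not part of the original patch.
+ * The rsp_event formula above asks the backend for an interrupt once
+ * roughly half of the currently outstanding requests have completed:
+ * with (hypothetical) prod = 10 and req_prod = 18, rsp_event becomes
+ * 10 + ((18 - 10) >> 1) + 1 = 15, so the backend notifies after it has
+ * produced response 15 rather than after every single packet.
+ */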
53456 +
53457 +
53458 +static void rx_refill_timeout(unsigned long data)
53459 +{
53460 +       struct net_device *dev = (struct net_device *)data;
53461 +       netif_rx_schedule(dev);
53462 +}
53463 +
53464 +
53465 +static void network_alloc_rx_buffers(struct net_device *dev)
53466 +{
53467 +       unsigned short id;
53468 +       struct netfront_info *np = netdev_priv(dev);
53469 +       struct sk_buff *skb;
53470 +       int i, batch_target;
53471 +       RING_IDX req_prod = np->rx.req_prod_pvt;
53472 +       struct xen_memory_reservation reservation;
53473 +       grant_ref_t ref;
53474 +
53475 +       if (unlikely(np->backend_state != BEST_CONNECTED))
53476 +               return;
53477 +
53478 +       /*
53479 +        * Allocate skbuffs greedily, even though we batch updates to the
53480 +        * receive ring. This creates a less bursty demand on the memory
53481 +        * allocator, so should reduce the chance of failed allocation requests
53482 +        * both for ourselves and for other kernel subsystems.
53483 +        */
53484 +       batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
53485 +       for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
53486 +               /*
53487 +                * Subtract dev_alloc_skb headroom (16 bytes) and shared info
53488 +                * tailroom, then round down to a SKB_DATA_ALIGN boundary.
53489 +                */
53490 +               skb = __dev_alloc_skb(
53491 +                       ((PAGE_SIZE - sizeof(struct skb_shared_info)) &
53492 +                        (-SKB_DATA_ALIGN(1))) - 16,
53493 +                       GFP_ATOMIC|__GFP_NOWARN);
53494 +               if (skb == NULL) {
53495 +                       /* Any skbuffs queued for refill? Force them out. */
53496 +                       if (i != 0)
53497 +                               goto refill;
53498 +                       /* Could not allocate any skbuffs. Try again later. */
53499 +                       mod_timer(&np->rx_refill_timer,
53500 +                                 jiffies + (HZ/10));
53501 +                       return;
53502 +               }
53503 +               __skb_queue_tail(&np->rx_batch, skb);
53504 +       }
53505 +
53506 +       /* Is the batch large enough to be worthwhile? */
53507 +       if (i < (np->rx_target/2))
53508 +               return;
53509 +
53510 +       /* Adjust our fill target if we risked running out of buffers. */
53511 +       if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
53512 +           ((np->rx_target *= 2) > np->rx_max_target))
53513 +               np->rx_target = np->rx_max_target;
53514 +
53515 + refill:
53516 +       for (i = 0; ; i++) {
53517 +               if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
53518 +                       break;
53519 +
53520 +               skb->dev = dev;
53521 +
53522 +               id = get_id_from_freelist(np->rx_skbs);
53523 +
53524 +               np->rx_skbs[id] = skb;
53525 +
53526 +               RING_GET_REQUEST(&np->rx, req_prod + i)->id = id;
53527 +               ref = gnttab_claim_grant_reference(&np->gref_rx_head);
53528 +               BUG_ON((signed short)ref < 0);
53529 +               np->grant_rx_ref[id] = ref;
53530 +               gnttab_grant_foreign_transfer_ref(ref,
53531 +                                                 np->xbdev->otherend_id,
53532 +                                                 __pa(skb->head) >> PAGE_SHIFT);
53533 +               RING_GET_REQUEST(&np->rx, req_prod + i)->gref = ref;
53534 +               np->rx_pfn_array[i] = virt_to_mfn(skb->head);
53535 +
53536 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
53537 +                       /* Remove this page before passing back to Xen. */
53538 +                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
53539 +                                           INVALID_P2M_ENTRY);
53540 +                       MULTI_update_va_mapping(np->rx_mcl+i,
53541 +                                               (unsigned long)skb->head,
53542 +                                               __pte(0), 0);
53543 +               }
53544 +       }
53545 +
53546 +       /* Tell the balloon driver what is going on. */
53547 +       balloon_update_driver_allowance(i);
53548 +
53549 +       reservation.extent_start = np->rx_pfn_array;
53550 +       reservation.nr_extents   = i;
53551 +       reservation.extent_order = 0;
53552 +       reservation.address_bits = 0;
53553 +       reservation.domid        = DOMID_SELF;
53554 +
53555 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
53556 +               /* After all PTEs have been zapped, flush the TLB. */
53557 +               np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
53558 +                       UVMF_TLB_FLUSH|UVMF_ALL;
53559 +
53560 +               /* Give away a batch of pages. */
53561 +               np->rx_mcl[i].op = __HYPERVISOR_memory_op;
53562 +               np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
53563 +               np->rx_mcl[i].args[1] = (unsigned long)&reservation;
53564 +
53565 +               /* Zap PTEs and give away pages in one big multicall. */
53566 +               (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
53567 +
53568 +               /* Check return status of HYPERVISOR_memory_op(). */
53569 +               if (unlikely(np->rx_mcl[i].result != i))
53570 +                       panic("Unable to reduce memory reservation\n");
53571 +       } else
53572 +               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
53573 +                                        &reservation) != i)
53574 +                       panic("Unable to reduce memory reservation\n");
53575 +
53576 +       /* Above is a suitable barrier to ensure backend will see requests. */
53577 +       np->rx.req_prod_pvt = req_prod + i;
53578 +       RING_PUSH_REQUESTS(&np->rx);
53579 +}
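+
+/*
+ * Editor's note, not part of the original patch: receive buffers are
+ * handed to the backend by page transfer, not by mapping -- each skb's
+ * data page is granted with gnttab_grant_foreign_transfer_ref(), its
+ * PTE is zapped and the frame is returned to Xen via
+ * XENMEM_decrease_reservation, with balloon_update_driver_allowance()
+ * keeping the balloon driver's accounting straight.  The backend is
+ * then expected to transfer a frame of packet data back in its place.
+ */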
53580 +
53581 +
53582 +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
53583 +{
53584 +       unsigned short id;
53585 +       struct netfront_info *np = netdev_priv(dev);
53586 +       netif_tx_request_t *tx;
53587 +       RING_IDX i;
53588 +       grant_ref_t ref;
53589 +       unsigned long mfn;
53590 +       int notify;
53591 +
53592 +       if (unlikely(np->tx_full)) {
53593 +               printk(KERN_ALERT "%s: full queue wasn't stopped!\n",
53594 +                      dev->name);
53595 +               netif_stop_queue(dev);
53596 +               goto drop;
53597 +       }
53598 +
53599 +       if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
53600 +                    PAGE_SIZE)) {
53601 +               struct sk_buff *nskb;
53602 +               nskb = __dev_alloc_skb(skb->len, GFP_ATOMIC|__GFP_NOWARN);
53603 +               if (unlikely(nskb == NULL))
53604 +                       goto drop;
53605 +               skb_put(nskb, skb->len);
53606 +               memcpy(nskb->data, skb->data, skb->len);
53607 +               nskb->dev = skb->dev;
53608 +               dev_kfree_skb(skb);
53609 +               skb = nskb;
53610 +       }
53611 +
53612 +       spin_lock_irq(&np->tx_lock);
53613 +
53614 +       if (np->backend_state != BEST_CONNECTED) {
53615 +               spin_unlock_irq(&np->tx_lock);
53616 +               goto drop;
53617 +       }
53618 +
53619 +       i = np->tx.req_prod_pvt;
53620 +
53621 +       id = get_id_from_freelist(np->tx_skbs);
53622 +       np->tx_skbs[id] = skb;
53623 +
53624 +       tx = RING_GET_REQUEST(&np->tx, i);
53625 +
53626 +       tx->id   = id;
53627 +       ref = gnttab_claim_grant_reference(&np->gref_tx_head);
53628 +       BUG_ON((signed short)ref < 0);
53629 +       mfn = virt_to_mfn(skb->data);
53630 +       gnttab_grant_foreign_access_ref(
53631 +               ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
53632 +       tx->gref = np->grant_tx_ref[id] = ref;
53633 +       tx->offset = (unsigned long)skb->data & ~PAGE_MASK;
53634 +       tx->size = skb->len;
53635 +
53636 +       tx->flags = 0;
53637 +       if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
53638 +               tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
53639 +       if (skb->proto_data_valid) /* remote but checksummed? */
53640 +               tx->flags |= NETTXF_data_validated;
53641 +
53642 +       np->tx.req_prod_pvt = i + 1;
53643 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
53644 +       if (notify)
53645 +               notify_remote_via_irq(np->irq);
53646 +
53647 +       network_tx_buf_gc(dev);
53648 +
53649 +       if (RING_FULL(&np->tx)) {
53650 +               np->tx_full = 1;
53651 +               netif_stop_queue(dev);
53652 +       }
53653 +
53654 +       spin_unlock_irq(&np->tx_lock);
53655 +
53656 +       np->stats.tx_bytes += skb->len;
53657 +       np->stats.tx_packets++;
53658 +
53659 +       return 0;
53660 +
53661 + drop:
53662 +       np->stats.tx_dropped++;
53663 +       dev_kfree_skb(skb);
53664 +       return 0;
53665 +}
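
The copy-on-page-crossing test at the top of network_start_xmit() exists because the backend maps each packet through a single grant, so the payload must lie within one page. A standalone sketch of the arithmetic (the EX_* names are local stand-ins for the kernel's PAGE_SIZE/PAGE_MASK; 4 KiB i386 pages assumed):

	#include <assert.h>
	#include <stdint.h>

	#define EX_PAGE_SIZE 4096UL
	#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

	/* Nonzero if [data, data+len) does not fit in the page holding data. */
	static int crosses_page(const void *data, unsigned long len)
	{
		return (((uintptr_t)data & ~EX_PAGE_MASK) + len) >= EX_PAGE_SIZE;
	}

	int main(void)
	{
		assert(!crosses_page((void *)0x1000, 100)); /* fits in one page */
		assert(crosses_page((void *)0x1f00, 512));  /* spills over      */
		return 0;
	}
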
53666 +
53667 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
53668 +{
53669 +       struct net_device *dev = dev_id;
53670 +       struct netfront_info *np = netdev_priv(dev);
53671 +       unsigned long flags;
53672 +
53673 +       spin_lock_irqsave(&np->tx_lock, flags);
53674 +       network_tx_buf_gc(dev);
53675 +       spin_unlock_irqrestore(&np->tx_lock, flags);
53676 +
53677 +       if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx) &&
53678 +           (np->user_state == UST_OPEN))
53679 +               netif_rx_schedule(dev);
53680 +
53681 +       return IRQ_HANDLED;
53682 +}
53683 +
53684 +
53685 +static int netif_poll(struct net_device *dev, int *pbudget)
53686 +{
53687 +       struct netfront_info *np = netdev_priv(dev);
53688 +       struct sk_buff *skb, *nskb;
53689 +       netif_rx_response_t *rx;
53690 +       RING_IDX i, rp;
53691 +       mmu_update_t *mmu = np->rx_mmu;
53692 +       multicall_entry_t *mcl = np->rx_mcl;
53693 +       int work_done, budget, more_to_do = 1;
53694 +       struct sk_buff_head rxq;
53695 +       unsigned long flags;
53696 +       unsigned long mfn;
53697 +       grant_ref_t ref;
53698 +
53699 +       spin_lock(&np->rx_lock);
53700 +
53701 +       if (np->backend_state != BEST_CONNECTED) {
53702 +               spin_unlock(&np->rx_lock);
53703 +               return 0;
53704 +       }
53705 +
53706 +       skb_queue_head_init(&rxq);
53707 +
53708 +       if ((budget = *pbudget) > dev->quota)
53709 +               budget = dev->quota;
53710 +       rp = np->rx.sring->rsp_prod;
53711 +       rmb(); /* Ensure we see queued responses up to 'rp'. */
53712 +
53713 +       for (i = np->rx.rsp_cons, work_done = 0;
53714 +            (i != rp) && (work_done < budget);
53715 +            i++, work_done++) {
53716 +               rx = RING_GET_RESPONSE(&np->rx, i);
53717 +
53718 +               /*
53719 +                * This definitely indicates a bug, either in this driver or
53720 +                * in the backend driver. In future this should flag the bad
53721 +                * situation to the system controller to reboot the backend.
53722 +                */
53723 +               if ((ref = np->grant_rx_ref[rx->id]) == GRANT_INVALID_REF) {
53724 +                       WPRINTK("Bad rx response id %d.\n", rx->id);
53725 +                       work_done--;
53726 +                       continue;
53727 +               }
53728 +
53729 +               /* Memory pressure, insufficient buffer headroom, ... */
53730 +               if ((mfn = gnttab_end_foreign_transfer_ref(ref)) == 0) {
53731 +                       if (net_ratelimit())
53732 +                               WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n",
53733 +                                       rx->id, rx->status);
53734 +                       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id =
53735 +                               rx->id;
53736 +                       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref =
53737 +                               ref;
53738 +                       np->rx.req_prod_pvt++;
53739 +                       RING_PUSH_REQUESTS(&np->rx);
53740 +                       work_done--;
53741 +                       continue;
53742 +               }
53743 +
53744 +               gnttab_release_grant_reference(&np->gref_rx_head, ref);
53745 +               np->grant_rx_ref[rx->id] = GRANT_INVALID_REF;
53746 +
53747 +               skb = np->rx_skbs[rx->id];
53748 +               add_id_to_freelist(np->rx_skbs, rx->id);
53749 +
53750 +               /* NB. We handle skb overflow later. */
53751 +               skb->data = skb->head + rx->offset;
53752 +               skb->len  = rx->status;
53753 +               skb->tail = skb->data + skb->len;
53754 +
53755 +               /*
53756 +                * Old backends do not assert data_validated but we
53757 +                * can infer it from csum_blank so test both flags.
53758 +                */
53759 +               if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank)) {
53760 +                       skb->ip_summed = CHECKSUM_UNNECESSARY;
53761 +                       skb->proto_data_valid = 1;
53762 +               } else {
53763 +                       skb->ip_summed = CHECKSUM_NONE;
53764 +                       skb->proto_data_valid = 0;
53765 +               }
53766 +               skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
53767 +
53768 +               np->stats.rx_packets++;
53769 +               np->stats.rx_bytes += rx->status;
53770 +
53771 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
53772 +                       /* Remap the page. */
53773 +                       MULTI_update_va_mapping(mcl, (unsigned long)skb->head,
53774 +                                               pfn_pte_ma(mfn, PAGE_KERNEL),
53775 +                                               0);
53776 +                       mcl++;
53777 +                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
53778 +                               | MMU_MACHPHYS_UPDATE;
53779 +                       mmu->val = __pa(skb->head) >> PAGE_SHIFT;
53780 +                       mmu++;
53781 +
53782 +                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
53783 +                                           mfn);
53784 +               }
53785 +
53786 +               __skb_queue_tail(&rxq, skb);
53787 +       }
53788 +
53789 +       /* Some pages are no longer absent... */
53790 +       balloon_update_driver_allowance(-work_done);
53791 +
53792 +       /* Do all the remapping work, and M2P updates, in one big hypercall. */
53793 +       if (likely((mcl - np->rx_mcl) != 0)) {
53794 +               mcl->op = __HYPERVISOR_mmu_update;
53795 +               mcl->args[0] = (unsigned long)np->rx_mmu;
53796 +               mcl->args[1] = mmu - np->rx_mmu;
53797 +               mcl->args[2] = 0;
53798 +               mcl->args[3] = DOMID_SELF;
53799 +               mcl++;
53800 +               (void)HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
53801 +       }
53802 +
53803 +       while ((skb = __skb_dequeue(&rxq)) != NULL) {
53804 +               if (skb->len > (dev->mtu + ETH_HLEN + 4)) {
53805 +                       if (net_ratelimit())
53806 +                               printk(KERN_INFO "Received packet too big for "
53807 +                                      "MTU (%d > %d)\n",
53808 +                                      skb->len - ETH_HLEN - 4, dev->mtu);
53809 +                       skb->len  = 0;
53810 +                       skb->tail = skb->data;
53811 +                       init_skb_shinfo(skb);
53812 +                       dev_kfree_skb(skb);
53813 +                       continue;
53814 +               }
53815 +
53816 +               /*
53817 +                * Enough room in skbuff for the data we were passed? Also,
53818 +                * Linux expects at least 16 bytes headroom in each rx buffer.
53819 +                */
53820 +               if (unlikely(skb->tail > skb->end) ||
53821 +                   unlikely((skb->data - skb->head) < 16)) {
53822 +                       if (net_ratelimit()) {
53823 +                               if (skb->tail > skb->end)
53824 +                                       printk(KERN_INFO "Received packet "
53825 +                                              "is %zd bytes beyond tail.\n",
53826 +                                              skb->tail - skb->end);
53827 +                               else
53828 +                                       printk(KERN_INFO "Received packet "
53829 +                                              "is %zd bytes before head.\n",
53830 +                                              16 - (skb->data - skb->head));
53831 +                       }
53832 +
53833 +                       nskb = __dev_alloc_skb(skb->len + 2,
53834 +                                              GFP_ATOMIC|__GFP_NOWARN);
53835 +                       if (nskb != NULL) {
53836 +                               skb_reserve(nskb, 2);
53837 +                               skb_put(nskb, skb->len);
53838 +                               memcpy(nskb->data, skb->data, skb->len);
53839 +                               nskb->dev = skb->dev;
53840 +                               nskb->ip_summed = skb->ip_summed;
53841 +                       }
53842 +
53843 +                       /* Reinitialise and then destroy the old skbuff. */
53844 +                       skb->len  = 0;
53845 +                       skb->tail = skb->data;
53846 +                       init_skb_shinfo(skb);
53847 +                       dev_kfree_skb(skb);
53848 +
53849 +                       /* Switch old for new, if we copied the buffer. */
53850 +                       if ((skb = nskb) == NULL)
53851 +                               continue;
53852 +               }
53853 +
53854 +               /* Set the shinfo area, which is hidden behind the data. */
53855 +               init_skb_shinfo(skb);
53856 +               /* Ethernet work: Delayed to here as it peeks the header. */
53857 +               skb->protocol = eth_type_trans(skb, dev);
53858 +
53859 +               /* Pass it up. */
53860 +               netif_receive_skb(skb);
53861 +               dev->last_rx = jiffies;
53862 +       }
53863 +
53864 +       np->rx.rsp_cons = i;
53865 +
53866 +       /* If we get a callback with very few responses, reduce fill target. */
53867 +       /* NB. Note exponential increase, linear decrease. */
53868 +       if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
53869 +            ((3*np->rx_target) / 4)) &&
53870 +           (--np->rx_target < np->rx_min_target))
53871 +               np->rx_target = np->rx_min_target;
53872 +
53873 +       network_alloc_rx_buffers(dev);
53874 +
53875 +       *pbudget   -= work_done;
53876 +       dev->quota -= work_done;
53877 +
53878 +       if (work_done < budget) {
53879 +               local_irq_save(flags);
53880 +
53881 +               RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
53882 +               if (!more_to_do)
53883 +                       __netif_rx_complete(dev);
53884 +
53885 +               local_irq_restore(flags);
53886 +       }
53887 +
53888 +       spin_unlock(&np->rx_lock);
53889 +
53890 +       return more_to_do;
53891 +}
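
netif_poll() above follows the 2.6-era NAPI contract: consume at most min(*budget, dev->quota) responses, charge the work to both counters, and return nonzero while responses remain so the core keeps polling. A generic skeleton of that contract, with the hypothetical helpers marked as such:

	/* Sketch only: have_rx_work() and consume_one() are hypothetical. */
	static int example_poll(struct net_device *dev, int *budget)
	{
		int work = 0, limit = min(*budget, dev->quota);

		while (work < limit && have_rx_work(dev))
			work += consume_one(dev);

		*budget    -= work;
		dev->quota -= work;

		if (!have_rx_work(dev)) {
			netif_rx_complete(dev); /* re-enable interrupt mode */
			return 0;               /* done: off the poll list  */
		}
		return 1;                       /* more work: poll again    */
	}
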
53892 +
53893 +
53894 +static int network_close(struct net_device *dev)
53895 +{
53896 +       struct netfront_info *np = netdev_priv(dev);
53897 +       np->user_state = UST_CLOSED;
53898 +       netif_stop_queue(np->netdev);
53899 +       return 0;
53900 +}
53901 +
53902 +
53903 +static struct net_device_stats *network_get_stats(struct net_device *dev)
53904 +{
53905 +       struct netfront_info *np = netdev_priv(dev);
53906 +       return &np->stats;
53907 +}
53908 +
53909 +static void network_connect(struct net_device *dev)
53910 +{
53911 +       struct netfront_info *np;
53912 +       int i, requeue_idx;
53913 +       netif_tx_request_t *tx;
53914 +       struct sk_buff *skb;
53915 +
53916 +       np = netdev_priv(dev);
53917 +       spin_lock_irq(&np->tx_lock);
53918 +       spin_lock(&np->rx_lock);
53919 +
53920 +       /* Recovery procedure: */
53921 +
53922 +       /* Step 1: Reinitialise variables. */
53923 +       np->tx_full = 0;
53924 +
53925 +       /*
53926 +        * Step 2: Rebuild the RX and TX ring contents.
53927 +        * NB. We could just free the queued TX packets now but we hope
53928 +        * that sending them out might do some good.  We have to rebuild
53929 +        * the RX ring because some of our pages are currently flipped out
53930 +        * so we can't just free the RX skbs.
53931 +        * NB2. Freelist index entries are always going to be less than
53932 +        * __PAGE_OFFSET, whereas pointers to skbs will always be equal to or
53933 +        * greater than __PAGE_OFFSET: we use this property to distinguish
53934 +        * them.
53935 +        */
53936 +
53937 +       /*
53938 +        * Rebuild the TX buffer freelist and the TX ring itself.
53939 +        * NB. This reorders packets.  We could keep more private state
53940 +        * to avoid this but maybe it doesn't matter so much given the
53941 +        * interface has been down.
53942 +        */
53943 +       for (requeue_idx = 0, i = 1; i <= NET_TX_RING_SIZE; i++) {
53944 +               if ((unsigned long)np->tx_skbs[i] < __PAGE_OFFSET)
53945 +                       continue;
53946 +
53947 +               skb = np->tx_skbs[i];
53948 +
53949 +               tx = RING_GET_REQUEST(&np->tx, requeue_idx);
53950 +               requeue_idx++;
53951 +
53952 +               tx->id = i;
53953 +               gnttab_grant_foreign_access_ref(
53954 +                       np->grant_tx_ref[i], np->xbdev->otherend_id,
53955 +                       virt_to_mfn(np->tx_skbs[i]->data),
53956 +                       GNTMAP_readonly);
53957 +               tx->gref = np->grant_tx_ref[i];
53958 +               tx->offset = (unsigned long)skb->data & ~PAGE_MASK;
53959 +               tx->size = skb->len;
53960 +               tx->flags = 0;
53961 +               if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
53962 +                       tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
53963 +               if (skb->proto_data_valid) /* remote but checksummed? */
53964 +                       tx->flags |= NETTXF_data_validated;
53965 +
53966 +               np->stats.tx_bytes += skb->len;
53967 +               np->stats.tx_packets++;
53968 +       }
53969 +
53970 +       np->tx.req_prod_pvt = requeue_idx;
53971 +       RING_PUSH_REQUESTS(&np->tx);
53972 +
53973 +       /* Rebuild the RX buffer freelist and the RX ring itself. */
53974 +       for (requeue_idx = 0, i = 1; i <= NET_RX_RING_SIZE; i++) {
53975 +               if ((unsigned long)np->rx_skbs[i] < __PAGE_OFFSET)
53976 +                       continue;
53977 +               gnttab_grant_foreign_transfer_ref(
53978 +                       np->grant_rx_ref[i], np->xbdev->otherend_id,
53979 +                       __pa(np->rx_skbs[i]->data) >> PAGE_SHIFT);
53980 +               RING_GET_REQUEST(&np->rx, requeue_idx)->gref =
53981 +                       np->grant_rx_ref[i];
53982 +               RING_GET_REQUEST(&np->rx, requeue_idx)->id = i;
53983 +               requeue_idx++;
53984 +       }
53985 +
53986 +       np->rx.req_prod_pvt = requeue_idx;
53987 +       RING_PUSH_REQUESTS(&np->rx);
53988 +
53989 +       /*
53990 +        * Step 3: All public and private state should now be sane.  Get
53991 +        * ready to start sending and receiving packets and give the driver
53992 +        * domain a kick because we've probably just requeued some
53993 +        * packets.
53994 +        */
53995 +       np->backend_state = BEST_CONNECTED;
53996 +       notify_remote_via_irq(np->irq);
53997 +       network_tx_buf_gc(dev);
53998 +
53999 +       if (np->user_state == UST_OPEN)
54000 +               netif_start_queue(dev);
54001 +
54002 +       spin_unlock(&np->rx_lock);
54003 +       spin_unlock_irq(&np->tx_lock);
54004 +}
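
The NB2 comment in network_connect() leans on the i386 address-space split: kernel virtual addresses, and therefore skb pointers, start at __PAGE_OFFSET (0xC0000000 under the default 3G/1G split, an assumption in this sketch), while freelist entries are tiny ring indices. The discrimination in isolation:

	#include <assert.h>
	#include <stdint.h>

	#define EX_PAGE_OFFSET 0xC0000000UL /* default i386 3G/1G split */

	/* A tx_skbs[]/rx_skbs[] slot holds either a small freelist index or
	 * a kernel pointer; the two value ranges cannot collide. */
	static int entry_is_skb(uintptr_t entry)
	{
		return entry >= EX_PAGE_OFFSET;
	}

	int main(void)
	{
		assert(!entry_is_skb(7));           /* freelist index */
		assert(entry_is_skb(0xC1234000UL)); /* kernel pointer */
		return 0;
	}
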
54005 +
54006 +static void show_device(struct netfront_info *np)
54007 +{
54008 +#ifdef DEBUG
54009 +       if (np) {
54010 +               IPRINTK("<vif handle=%u %s(%s) evtchn=%u tx=%p rx=%p>\n",
54011 +                       np->handle,
54012 +                       be_state_name[np->backend_state],
54013 +                       np->user_state ? "open" : "closed",
54014 +                       np->evtchn,
54015 +                       np->tx,
54016 +                       np->rx);
54017 +       } else
54018 +               IPRINTK("<vif NULL>\n");
54019 +#endif
54020 +}
54021 +
54022 +static void netif_uninit(struct net_device *dev)
54023 +{
54024 +       struct netfront_info *np = netdev_priv(dev);
54025 +       gnttab_free_grant_references(np->gref_tx_head);
54026 +       gnttab_free_grant_references(np->gref_rx_head);
54027 +}
54028 +
54029 +static struct ethtool_ops network_ethtool_ops =
54030 +{
54031 +       .get_tx_csum = ethtool_op_get_tx_csum,
54032 +       .set_tx_csum = ethtool_op_set_tx_csum,
54033 +};
54034 +
54035 +/** Create a network device.
54036 + * @param handle device handle
54037 + * @param val return parameter for created device
54038 + * @return 0 on success, error code otherwise
54039 + */
54040 +static int create_netdev(int handle, struct xenbus_device *dev,
54041 +                        struct net_device **val)
54042 +{
54043 +       int i, err = 0;
54044 +       struct net_device *netdev = NULL;
54045 +       struct netfront_info *np = NULL;
54046 +
54047 +       if ((netdev = alloc_etherdev(sizeof(struct netfront_info))) == NULL) {
54048 +               printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
54049 +                      __FUNCTION__);
54050 +               err = -ENOMEM;
54051 +               goto exit;
54052 +       }
54053 +
54054 +       np                = netdev_priv(netdev);
54055 +       np->backend_state = BEST_CLOSED;
54056 +       np->user_state    = UST_CLOSED;
54057 +       np->handle        = handle;
54058 +       np->xbdev         = dev;
54059 +
54060 +       spin_lock_init(&np->tx_lock);
54061 +       spin_lock_init(&np->rx_lock);
54062 +
54063 +       skb_queue_head_init(&np->rx_batch);
54064 +       np->rx_target     = RX_DFL_MIN_TARGET;
54065 +       np->rx_min_target = RX_DFL_MIN_TARGET;
54066 +       np->rx_max_target = RX_MAX_TARGET;
54067 +
54068 +       init_timer(&np->rx_refill_timer);
54069 +       np->rx_refill_timer.data = (unsigned long)netdev;
54070 +       np->rx_refill_timer.function = rx_refill_timeout;
54071 +
54072 +       /* Initialise {tx,rx}_skbs as a free chain containing every entry. */
54073 +       for (i = 0; i <= NET_TX_RING_SIZE; i++) {
54074 +               np->tx_skbs[i] = (void *)((unsigned long) i+1);
54075 +               np->grant_tx_ref[i] = GRANT_INVALID_REF;
54076 +       }
54077 +
54078 +       for (i = 0; i <= NET_RX_RING_SIZE; i++) {
54079 +               np->rx_skbs[i] = (void *)((unsigned long) i+1);
54080 +               np->grant_rx_ref[i] = GRANT_INVALID_REF;
54081 +       }
54082 +
54083 +       /* A grant for every tx ring slot */
54084 +       if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
54085 +                                         &np->gref_tx_head) < 0) {
54086 +               printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
54087 +               err = -ENOMEM;
54088 +               goto exit;
54089 +       }
54090 +       /* A grant for every rx ring slot */
54091 +       if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
54092 +                                         &np->gref_rx_head) < 0) {
54093 +               printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
54094 +               gnttab_free_grant_references(np->gref_tx_head);
54095 +               err = -ENOMEM;
54096 +               goto exit;
54097 +       }
54098 +
54099 +       netdev->open            = network_open;
54100 +       netdev->hard_start_xmit = network_start_xmit;
54101 +       netdev->stop            = network_close;
54102 +       netdev->get_stats       = network_get_stats;
54103 +       netdev->poll            = netif_poll;
54104 +       netdev->uninit          = netif_uninit;
54105 +       netdev->weight          = 64;
54106 +       netdev->features        = NETIF_F_IP_CSUM;
54107 +
54108 +       SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
54109 +       SET_MODULE_OWNER(netdev);
54110 +       SET_NETDEV_DEV(netdev, &dev->dev);
54111 +
54112 +       if ((err = register_netdev(netdev)) != 0) {
54113 +               printk(KERN_WARNING "%s> register_netdev err=%d\n",
54114 +                      __FUNCTION__, err);
54115 +               goto exit_free_grefs;
54116 +       }
54117 +
54118 +       if ((err = xennet_proc_addif(netdev)) != 0) {
54119 +               unregister_netdev(netdev);
54120 +               goto exit_free_grefs;
54121 +       }
54122 +
54123 +       np->netdev = netdev;
54124 +
54125 + exit:
54126 +       if (err != 0)
54127 +               free_netdev(netdev); /* not kfree: undo alloc_etherdev() */
54128 +       else if (val != NULL)
54129 +               *val = netdev;
54130 +       return err;
54131 +
54132 + exit_free_grefs:
54133 +       gnttab_free_grant_references(np->gref_tx_head);
54134 +       gnttab_free_grant_references(np->gref_rx_head);
54135 +       goto exit;
54136 +}
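
The two initialisation loops above seed tx_skbs[]/rx_skbs[] so that slot 0 acts as a freelist head and every free slot stores the index of the next free slot, cast to a fake pointer; get_id_from_freelist() and add_id_to_freelist() (defined earlier in this patch) then allocate and release ring ids in O(1). A self-contained re-creation of the idiom (the helper bodies here are a sketch, not a verbatim copy):

	#include <assert.h>
	#include <stdint.h>

	#define EX_RING_SIZE 4

	static void *skbs[EX_RING_SIZE + 1]; /* slot 0 = freelist head */

	static unsigned short get_id(void **list)
	{
		unsigned short id = (uintptr_t)list[0];
		list[0] = list[id];                 /* unlink head */
		return id;
	}

	static void add_id(void **list, unsigned short id)
	{
		list[id] = list[0];                 /* relink at head */
		list[0]  = (void *)(uintptr_t)id;
	}

	int main(void)
	{
		int i, id;
		for (i = 0; i <= EX_RING_SIZE; i++)
			skbs[i] = (void *)(uintptr_t)(i + 1);

		id = get_id(skbs);                  /* -> 1 */
		skbs[id] = (void *)"an skb";        /* slot holds a real pointer */
		add_id(skbs, id);                   /* slot back on the freelist */
		assert(get_id(skbs) == id);
		return 0;
	}
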
54137 +
54138 +/*
54139 + * We use this notifier to send out a fake ARP reply to reset switches and
54140 + * router ARP caches when an IP interface is brought up on a VIF.
54141 + */
54142 +static int
54143 +inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
54144 +{
54145 +       struct in_ifaddr  *ifa = (struct in_ifaddr *)ptr;
54146 +       struct net_device *dev = ifa->ifa_dev->dev;
54147 +
54148 +       /* UP event and is it one of our devices? */
54149 +       if (event == NETDEV_UP && dev->open == network_open)
54150 +               (void)send_fake_arp(dev);
54151 +
54152 +       return NOTIFY_DONE;
54153 +}
54154 +
54155 +
54156 +/* ** Close down ** */
54157 +
54158 +
54159 +/**
54160 + * Handle the change of state of the backend to Closing.  We must delete our
54161 + * device-layer structures now, to ensure that writes are flushed through to
54162 + * the backend.  Once this is done, we can switch to Closed in
54163 + * acknowledgement.
54164 + */
54165 +static void netfront_closing(struct xenbus_device *dev)
54166 +{
54167 +       struct netfront_info *info = dev->data;
54168 +
54169 +       DPRINTK("netfront_closing: %s removed\n", dev->nodename);
54170 +
54171 +       close_netdev(info);
54172 +
54173 +       xenbus_switch_state(dev, XenbusStateClosed);
54174 +}
54175 +
54176 +
54177 +static int netfront_remove(struct xenbus_device *dev)
54178 +{
54179 +       struct netfront_info *info = dev->data;
54180 +
54181 +       DPRINTK("%s\n", dev->nodename);
54182 +
54183 +       netif_disconnect_backend(info);
54184 +       free_netdev(info->netdev);
54185 +
54186 +       return 0;
54187 +}
54188 +
54189 +
54190 +static void close_netdev(struct netfront_info *info)
54191 +{
54192 +       spin_lock_irq(&info->netdev->xmit_lock);
54193 +       netif_stop_queue(info->netdev);
54194 +       spin_unlock_irq(&info->netdev->xmit_lock);
54195 +
54196 +#ifdef CONFIG_PROC_FS
54197 +       xennet_proc_delif(info->netdev);
54198 +#endif
54199 +
54200 +       del_timer_sync(&info->rx_refill_timer);
54201 +
54202 +       unregister_netdev(info->netdev);
54203 +}
54204 +
54205 +
54206 +static void netif_disconnect_backend(struct netfront_info *info)
54207 +{
54208 +       /* Stop old i/f to prevent errors whilst we rebuild the state. */
54209 +       spin_lock_irq(&info->tx_lock);
54210 +       spin_lock(&info->rx_lock);
54211 +       info->backend_state = BEST_DISCONNECTED;
54212 +       spin_unlock(&info->rx_lock);
54213 +       spin_unlock_irq(&info->tx_lock);
54214 +
54215 +       if (info->irq)
54216 +               unbind_from_irqhandler(info->irq, info->netdev);
54217 +       info->evtchn = info->irq = 0;
54218 +
54219 +       end_access(info->tx_ring_ref, info->tx.sring);
54220 +       end_access(info->rx_ring_ref, info->rx.sring);
54221 +       info->tx_ring_ref = GRANT_INVALID_REF;
54222 +       info->rx_ring_ref = GRANT_INVALID_REF;
54223 +       info->tx.sring = NULL;
54224 +       info->rx.sring = NULL;
54225 +}
54226 +
54227 +
54228 +static void netif_free(struct netfront_info *info)
54229 +{
54230 +       close_netdev(info);
54231 +       netif_disconnect_backend(info);
54232 +       free_netdev(info->netdev);
54233 +}
54234 +
54235 +
54236 +static void end_access(int ref, void *page)
54237 +{
54238 +       if (ref != GRANT_INVALID_REF)
54239 +               gnttab_end_foreign_access(ref, 0, (unsigned long)page);
54240 +}
54241 +
54242 +
54243 +/* ** Driver registration ** */
54244 +
54245 +
54246 +static struct xenbus_device_id netfront_ids[] = {
54247 +       { "vif" },
54248 +       { "" }
54249 +};
54250 +
54251 +
54252 +static struct xenbus_driver netfront = {
54253 +       .name = "vif",
54254 +       .owner = THIS_MODULE,
54255 +       .ids = netfront_ids,
54256 +       .probe = netfront_probe,
54257 +       .remove = netfront_remove,
54258 +       .resume = netfront_resume,
54259 +       .otherend_changed = backend_changed,
54260 +};
54261 +
54262 +
54263 +static struct notifier_block notifier_inetdev = {
54264 +       .notifier_call  = inetdev_notify,
54265 +       .next           = NULL,
54266 +       .priority       = 0
54267 +};
54268 +
54269 +static int __init netif_init(void)
54270 +{
54271 +       int err = 0;
54272 +
54273 +       if (xen_start_info->flags & SIF_INITDOMAIN)
54274 +               return 0;
54275 +
54276 +       if ((err = xennet_proc_init()) != 0)
54277 +               return err;
54278 +
54279 +       IPRINTK("Initialising virtual ethernet driver.\n");
54280 +
54281 +       (void)register_inetaddr_notifier(&notifier_inetdev);
54282 +
54283 +       return xenbus_register_frontend(&netfront);
54284 +}
54285 +module_init(netif_init);
54286 +
54287 +
54288 +static void netif_exit(void)
54289 +{
54290 +       unregister_inetaddr_notifier(&notifier_inetdev);
54291 +
54292 +       return xenbus_unregister_driver(&netfront);
54293 +}
54294 +module_exit(netif_exit);
54295 +
54296 +MODULE_LICENSE("Dual BSD/GPL");
54297 +
54298 +
54299 +/* ** /proc ** */
54300 +
54301 +
54302 +#ifdef CONFIG_PROC_FS
54303 +
54304 +#define TARGET_MIN 0UL
54305 +#define TARGET_MAX 1UL
54306 +#define TARGET_CUR 2UL
54307 +
54308 +static int xennet_proc_read(
54309 +       char *page, char **start, off_t off, int count, int *eof, void *data)
54310 +{
54311 +       struct net_device *dev =
54312 +               (struct net_device *)((unsigned long)data & ~3UL);
54313 +       struct netfront_info *np = netdev_priv(dev);
54314 +       int len = 0, which_target = (long)data & 3;
54315 +
54316 +       switch (which_target) {
54317 +       case TARGET_MIN:
54318 +               len = sprintf(page, "%d\n", np->rx_min_target);
54319 +               break;
54320 +       case TARGET_MAX:
54321 +               len = sprintf(page, "%d\n", np->rx_max_target);
54322 +               break;
54323 +       case TARGET_CUR:
54324 +               len = sprintf(page, "%d\n", np->rx_target);
54325 +               break;
54326 +       }
54327 +
54328 +       *eof = 1;
54329 +       return len;
54330 +}
54331 +
54332 +static int xennet_proc_write(
54333 +       struct file *file, const char __user *buffer,
54334 +       unsigned long count, void *data)
54335 +{
54336 +       struct net_device *dev =
54337 +               (struct net_device *)((unsigned long)data & ~3UL);
54338 +       struct netfront_info *np = netdev_priv(dev);
54339 +       int which_target = (long)data & 3;
54340 +       char string[64];
54341 +       long target;
54342 +
54343 +       if (!capable(CAP_SYS_ADMIN))
54344 +               return -EPERM;
54345 +
54346 +       if (count <= 1)
54347 +               return -EBADMSG; /* runt */
54348 +       if (count >= sizeof(string))
54349 +               return -EFBIG;   /* too long */
54350 +
54351 +       if (copy_from_user(string, buffer, count))
54352 +               return -EFAULT;
54353 +       string[count] = '\0';
54354 +
54355 +       target = simple_strtol(string, NULL, 10);
54356 +       if (target < RX_MIN_TARGET)
54357 +               target = RX_MIN_TARGET;
54358 +       if (target > RX_MAX_TARGET)
54359 +               target = RX_MAX_TARGET;
54360 +
54361 +       spin_lock(&np->rx_lock);
54362 +
54363 +       switch (which_target) {
54364 +       case TARGET_MIN:
54365 +               if (target > np->rx_max_target)
54366 +                       np->rx_max_target = target;
54367 +               np->rx_min_target = target;
54368 +               if (target > np->rx_target)
54369 +                       np->rx_target = target;
54370 +               break;
54371 +       case TARGET_MAX:
54372 +               if (target < np->rx_min_target)
54373 +                       np->rx_min_target = target;
54374 +               np->rx_max_target = target;
54375 +               if (target < np->rx_target)
54376 +                       np->rx_target = target;
54377 +               break;
54378 +       case TARGET_CUR:
54379 +               break;
54380 +       }
54381 +
54382 +       network_alloc_rx_buffers(dev);
54383 +
54384 +       spin_unlock(&np->rx_lock);
54385 +
54386 +       return count;
54387 +}
54388 +
54389 +static int xennet_proc_init(void)
54390 +{
54391 +       if (proc_mkdir("xen/net", NULL) == NULL)
54392 +               return -ENOMEM;
54393 +       return 0;
54394 +}
54395 +
54396 +static int xennet_proc_addif(struct net_device *dev)
54397 +{
54398 +       struct proc_dir_entry *dir, *min, *max, *cur;
54399 +       char name[30];
54400 +
54401 +       sprintf(name, "xen/net/%s", dev->name);
54402 +
54403 +       dir = proc_mkdir(name, NULL);
54404 +       if (!dir)
54405 +               goto nomem;
54406 +
54407 +       min = create_proc_entry("rxbuf_min", 0644, dir);
54408 +       max = create_proc_entry("rxbuf_max", 0644, dir);
54409 +       cur = create_proc_entry("rxbuf_cur", 0444, dir);
54410 +       if (!min || !max || !cur)
54411 +               goto nomem;
54412 +
54413 +       min->read_proc  = xennet_proc_read;
54414 +       min->write_proc = xennet_proc_write;
54415 +       min->data       = (void *)((unsigned long)dev | TARGET_MIN);
54416 +
54417 +       max->read_proc  = xennet_proc_read;
54418 +       max->write_proc = xennet_proc_write;
54419 +       max->data       = (void *)((unsigned long)dev | TARGET_MAX);
54420 +
54421 +       cur->read_proc  = xennet_proc_read;
54422 +       cur->write_proc = xennet_proc_write;
54423 +       cur->data       = (void *)((unsigned long)dev | TARGET_CUR);
54424 +
54425 +       return 0;
54426 +
54427 + nomem:
54428 +       xennet_proc_delif(dev);
54429 +       return -ENOMEM;
54430 +}
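
xennet_proc_addif() smuggles a two-bit TARGET_* tag into each proc entry's data pointer, and xennet_proc_read()/xennet_proc_write() peel it back off with '& 3' and '& ~3UL'. This works because a struct net_device is at least 4-byte aligned, so the low two bits of its address are always zero. A standalone sketch:

	#include <assert.h>
	#include <stdint.h>

	#define EX_TARGET_MAX 1UL

	int main(void)
	{
		static int dummy_dev; /* 4-byte-aligned stand-in for a net_device */
		void *data = (void *)((uintptr_t)&dummy_dev | EX_TARGET_MAX);

		int   which = (uintptr_t)data & 3;              /* the tag     */
		void *dev   = (void *)((uintptr_t)data & ~3UL); /* the pointer */

		assert(which == EX_TARGET_MAX);
		assert(dev == &dummy_dev);
		return 0;
	}
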
54431 +
54432 +static void xennet_proc_delif(struct net_device *dev)
54433 +{
54434 +       char name[40]; /* "xen/net/" + IFNAMSIZ name + "/rxbuf_min" */
54435 +
54436 +       sprintf(name, "xen/net/%s/rxbuf_min", dev->name);
54437 +       remove_proc_entry(name, NULL);
54438 +
54439 +       sprintf(name, "xen/net/%s/rxbuf_max", dev->name);
54440 +       remove_proc_entry(name, NULL);
54441 +
54442 +       sprintf(name, "xen/net/%s/rxbuf_cur", dev->name);
54443 +       remove_proc_entry(name, NULL);
54444 +
54445 +       sprintf(name, "xen/net/%s", dev->name);
54446 +       remove_proc_entry(name, NULL);
54447 +}
54448 +
54449 +#endif
54450 +
54451 +
54452 +/*
54453 + * Local variables:
54454 + *  c-file-style: "linux"
54455 + *  indent-tabs-mode: t
54456 + *  c-indent-level: 8
54457 + *  c-basic-offset: 8
54458 + *  tab-width: 8
54459 + * End:
54460 + */
54461 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/conf_space.c tmp-linux-2.6-xen.patch/drivers/xen/pciback/conf_space.c
54462 --- ref-linux-2.6.16.9/drivers/xen/pciback/conf_space.c 1970-01-01 01:00:00.000000000 +0100
54463 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/conf_space.c    2006-04-10 00:05:52.000000000 +0200
54464 @@ -0,0 +1,345 @@
54465 +/*
54466 + * PCI Backend - Functions for creating a virtual configuration space for
54467 + *               exported PCI Devices.
54468 + *               It's dangerous to allow PCI Driver Domains to change their
54469 + *               device's resources (memory, i/o ports, interrupts). We need to
54470 + *               restrict changes to certain PCI Configuration registers:
54471 + *               BARs, INTERRUPT_PIN, most registers in the header...
54472 + *
54473 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
54474 + */
54475 +
54476 +#include <linux/kernel.h>
54477 +#include <linux/pci.h>
54478 +#include "pciback.h"
54479 +#include "conf_space.h"
54480 +
54481 +static int permissive = 0;
54482 +module_param(permissive, bool, 0644);
54483 +
54484 +#define DEFINE_PCI_CONFIG(op,size,type)                                        \
54485 +int pciback_##op##_config_##size                                                       \
54486 +(struct pci_dev *dev, int offset, type value, void *data)      \
54487 +{                                                                                                                      \
54488 +       return pci_##op##_config_##size (dev, offset, value);   \
54489 +}
54490 +
54491 +DEFINE_PCI_CONFIG(read, byte, u8 *)
54492 +DEFINE_PCI_CONFIG(read, word, u16 *)
54493 +DEFINE_PCI_CONFIG(read, dword, u32 *)
54494 +
54495 +DEFINE_PCI_CONFIG(write, byte, u8)
54496 +DEFINE_PCI_CONFIG(write, word, u16)
54497 +DEFINE_PCI_CONFIG(write, dword, u32)
54498 +
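Each DEFINE_PCI_CONFIG() invocation pastes together a trivial pass-through wrapper; the unused data parameter exists only so the wrappers match the conf_* callback typedefs in conf_space.h. For instance, DEFINE_PCI_CONFIG(read, byte, u8 *) expands (whitespace tidied) to:

	int pciback_read_config_byte(struct pci_dev *dev, int offset,
				     u8 *value, void *data)
	{
		return pci_read_config_byte(dev, offset, value);
	}
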
54499 +static int conf_space_read(struct pci_dev *dev,
54500 +                          struct config_field_entry *entry, int offset,
54501 +                          u32 * value)
54502 +{
54503 +       int ret = 0;
54504 +       struct config_field *field = entry->field;
54505 +
54506 +       *value = 0;
54507 +
54508 +       switch (field->size) {
54509 +       case 1:
54510 +               if (field->u.b.read)
54511 +                       ret = field->u.b.read(dev, offset, (u8 *) value,
54512 +                                             entry->data);
54513 +               break;
54514 +       case 2:
54515 +               if (field->u.w.read)
54516 +                       ret = field->u.w.read(dev, offset, (u16 *) value,
54517 +                                             entry->data);
54518 +               break;
54519 +       case 4:
54520 +               if (field->u.dw.read)
54521 +                       ret = field->u.dw.read(dev, offset, value, entry->data);
54522 +               break;
54523 +       }
54524 +       return ret;
54525 +}
54526 +
54527 +static int conf_space_write(struct pci_dev *dev,
54528 +                           struct config_field_entry *entry, int offset,
54529 +                           u32 value)
54530 +{
54531 +       int ret = 0;
54532 +       struct config_field *field = entry->field;
54533 +
54534 +       switch (field->size) {
54535 +       case 1:
54536 +               if (field->u.b.write)
54537 +                       ret = field->u.b.write(dev, offset, (u8) value,
54538 +                                              entry->data);
54539 +               break;
54540 +       case 2:
54541 +               if (field->u.w.write)
54542 +                       ret = field->u.w.write(dev, offset, (u16) value,
54543 +                                              entry->data);
54544 +               break;
54545 +       case 4:
54546 +               if (field->u.dw.write)
54547 +                       ret = field->u.dw.write(dev, offset, value,
54548 +                                               entry->data);
54549 +               break;
54550 +       }
54551 +       return ret;
54552 +}
54553 +
54554 +static inline u32 get_mask(int size)
54555 +{
54556 +       if (size == 1)
54557 +               return 0xff;
54558 +       else if (size == 2)
54559 +               return 0xffff;
54560 +       else
54561 +               return 0xffffffff;
54562 +}
54563 +
54564 +static inline int valid_request(int offset, int size)
54565 +{
54566 +       /* Validate request (no un-aligned requests) */
54567 +       if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
54568 +               return 1;
54569 +       return 0;
54570 +}
54571 +
54572 +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
54573 +                             int offset)
54574 +{
54575 +       if (offset >= 0) {
54576 +               new_val_mask <<= (offset * 8);
54577 +               new_val <<= (offset * 8);
54578 +       } else {
54579 +               new_val_mask >>= (offset * -8);
54580 +               new_val >>= (offset * -8);
54581 +       }
54582 +       val = (val & ~new_val_mask) | (new_val & new_val_mask);
54583 +
54584 +       return val;
54585 +}
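
merge_value() splices a new value into a 32-bit window: a positive offset shifts the new value (and its mask) up into the dword, a negative offset shifts them down. A self-contained copy of the logic with two worked cases:

	#include <assert.h>
	#include <stdint.h>

	/* Same logic as merge_value() above. */
	static uint32_t merge(uint32_t val, uint32_t nv, uint32_t mask, int off)
	{
		if (off >= 0) {
			mask <<= (off * 8);
			nv   <<= (off * 8);
		} else {
			mask >>= (off * -8);
			nv   >>= (off * -8);
		}
		return (val & ~mask) | (nv & mask);
	}

	int main(void)
	{
		/* A 2-byte field value 0xBEEF lands in bytes 2-3 of a dword
		 * (off = field_start - req_start = +2). */
		assert(merge(0x11223344, 0xBEEF, 0xffff, 2) == 0xBEEF3344);

		/* A 2-byte read at offset 2 of a 4-byte field at offset 0
		 * picks up the field's top half (off = 0 - 2 = -2). */
		assert(merge(0x3344, 0xAABBCCDD, 0xffffffff, -2) == 0xAABB);
		return 0;
	}
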
54586 +
54587 +static int pcibios_err_to_errno(int err)
54588 +{
54589 +       switch (err) {
54590 +       case PCIBIOS_SUCCESSFUL:
54591 +               return XEN_PCI_ERR_success;
54592 +       case PCIBIOS_DEVICE_NOT_FOUND:
54593 +               return XEN_PCI_ERR_dev_not_found;
54594 +       case PCIBIOS_BAD_REGISTER_NUMBER:
54595 +               return XEN_PCI_ERR_invalid_offset;
54596 +       case PCIBIOS_FUNC_NOT_SUPPORTED:
54597 +               return XEN_PCI_ERR_not_implemented;
54598 +       case PCIBIOS_SET_FAILED:
54599 +               return XEN_PCI_ERR_access_denied;
54600 +       }
54601 +       return err;
54602 +}
54603 +
54604 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
54605 +                       u32 * ret_val)
54606 +{
54607 +       int err = 0;
54608 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54609 +       struct config_field_entry *cfg_entry;
54610 +       struct config_field *field;
54611 +       int req_start, req_end, field_start, field_end;
54612 +       /* if read fails for any reason, return 0 (as if device didn't respond) */
54613 +       u32 value = 0, tmp_val;
54614 +
54615 +       if (unlikely(verbose_request))
54616 +               printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
54617 +                      pci_name(dev), size, offset);
54618 +
54619 +       if (!valid_request(offset, size)) {
54620 +               err = XEN_PCI_ERR_invalid_offset;
54621 +               goto out;
54622 +       }
54623 +
54624 +       /* Get the real value first, then modify as appropriate */
54625 +       switch (size) {
54626 +       case 1:
54627 +               err = pci_read_config_byte(dev, offset, (u8 *) & value);
54628 +               break;
54629 +       case 2:
54630 +               err = pci_read_config_word(dev, offset, (u16 *) & value);
54631 +               break;
54632 +       case 4:
54633 +               err = pci_read_config_dword(dev, offset, &value);
54634 +               break;
54635 +       }
54636 +
54637 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
54638 +               field = cfg_entry->field;
54639 +
54640 +               req_start = offset;
54641 +               req_end = offset + size;
54642 +               field_start = field->offset;
54643 +               field_end = field->offset + field->size;
54644 +
54645 +               /* Does the request range overlap this field (half-open)? */
54646 +               if (req_end > field_start && field_end > req_start) {
54647 +                       err = conf_space_read(dev, cfg_entry, field_start,
54648 +                                             &tmp_val);
54649 +                       if (err)
54650 +                               goto out;
54651 +
54652 +                       value = merge_value(value, tmp_val,
54653 +                                           get_mask(field->size),
54654 +                                           field_start - req_start);
54655 +               }
54656 +       }
54657 +
54658 +      out:
54659 +       if (unlikely(verbose_request))
54660 +               printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
54661 +                      pci_name(dev), size, offset, value);
54662 +
54663 +       *ret_val = value;
54664 +       return pcibios_err_to_errno(err);
54665 +}
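
The loop above matches the request window [offset, offset + size) against each field window [field->offset, field->offset + field->size); two half-open ranges intersect exactly when each one starts before the other ends, which also catches a small field buried inside a wider request. The test in isolation:

	#include <assert.h>

	/* [a0,a1) and [b0,b1) overlap iff each starts before the other ends. */
	static int overlaps(int a0, int a1, int b0, int b1)
	{
		return a1 > b0 && b1 > a0;
	}

	int main(void)
	{
		assert(overlaps(0x3c, 0x40, 0x3d, 0x3e));  /* dword covers a byte field  */
		assert(!overlaps(0x10, 0x14, 0x14, 0x18)); /* adjacent ranges: no overlap */
		return 0;
	}
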
54666 +
54667 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
54668 +{
54669 +       int err = 0, handled = 0;
54670 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54671 +       struct config_field_entry *cfg_entry;
54672 +       struct config_field *field;
54673 +       u32 tmp_val;
54674 +       int req_start, req_end, field_start, field_end;
54675 +
54676 +       if (unlikely(verbose_request))
54677 +               printk(KERN_DEBUG
54678 +                      "pciback: %s: write request %d bytes at 0x%x = %x\n",
54679 +                      pci_name(dev), size, offset, value);
54680 +
54681 +       if (!valid_request(offset, size))
54682 +               return XEN_PCI_ERR_invalid_offset;
54683 +
54684 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
54685 +               field = cfg_entry->field;
54686 +
54687 +               req_start = offset;
54688 +               req_end = offset + size;
54689 +               field_start = field->offset;
54690 +               field_end = field->offset + field->size;
54691 +
54692 +               /* Does the request range overlap this field (half-open)? */
54693 +               if (req_end > field_start && field_end > req_start) {
54694 +                       tmp_val = 0;
54695 +
54696 +                       err = pciback_config_read(dev, field_start,
54697 +                                                 field->size, &tmp_val);
54698 +                       if (err)
54699 +                               break;
54700 +
54701 +                       tmp_val = merge_value(tmp_val, value, get_mask(size),
54702 +                                             req_start - field_start);
54703 +
54704 +                       err = conf_space_write(dev, cfg_entry, field_start,
54705 +                                              tmp_val);
54706 +                       handled = 1;
54707 +               }
54708 +       }
54709 +
54710 +       if (!handled && !err && permissive) {
54711 +               switch (size) {
54712 +               case 1:
54713 +                       err = pci_write_config_byte(dev, offset, (u8)value);
54714 +                       break;
54715 +               case 2:
54716 +                       err = pci_write_config_word(dev, offset, (u16)value);
54717 +                       break;
54718 +               case 4:
54719 +                       err = pci_write_config_dword(dev, offset, (u32)value);
54720 +                       break;
54721 +               }
54722 +       }
54723 +
54724 +       return pcibios_err_to_errno(err);
54725 +}
54726 +
54727 +void pciback_config_reset(struct pci_dev *dev)
54728 +{
54729 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54730 +       struct config_field_entry *cfg_entry;
54731 +       struct config_field *field;
54732 +
54733 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
54734 +               field = cfg_entry->field;
54735 +
54736 +               if (field->reset)
54737 +                       field->reset(dev, field->offset, cfg_entry->data);
54738 +       }
54739 +}
54740 +
54741 +void pciback_config_free(struct pci_dev *dev)
54742 +{
54743 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54744 +       struct config_field_entry *cfg_entry, *t;
54745 +       struct config_field *field;
54746 +
54747 +       list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
54748 +               list_del(&cfg_entry->list);
54749 +
54750 +               field = cfg_entry->field;
54751 +
54752 +               if (field->release)
54753 +                       field->release(dev, field->offset, cfg_entry->data);
54754 +
54755 +               kfree(cfg_entry);
54756 +       }
54757 +}
54758 +
54759 +int pciback_config_add_field(struct pci_dev *dev, struct config_field *field)
54760 +{
54761 +       int err = 0;
54762 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54763 +       struct config_field_entry *cfg_entry;
54764 +       void *tmp;
54765 +
54766 +       cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
54767 +       if (!cfg_entry) {
54768 +               err = -ENOMEM;
54769 +               goto out;
54770 +       }
54771 +
54772 +       cfg_entry->data = NULL;
54773 +       cfg_entry->field = field;
54774 +
54775 +       if (field->init) {
54776 +               tmp = field->init(dev, field->offset);
54777 +
54778 +               if (IS_ERR(tmp)) {
54779 +                       err = PTR_ERR(tmp);
54780 +                       goto out;
54781 +               }
54782 +
54783 +               cfg_entry->data = tmp;
54784 +       }
54785 +
54786 +       list_add_tail(&cfg_entry->list, &dev_data->config_fields);
54787 +
54788 +      out:
54789 +       if (err)
54790 +               kfree(cfg_entry);
54791 +
54792 +       return err;
54793 +}
54794 +
54795 +/* This sets up the device's virtual configuration space to keep track of
54796 + * certain registers (like the base address registers (BARs)) so that we can
54797 + * keep the client from manipulating them directly.
54798 + */
54799 +int pciback_config_init(struct pci_dev *dev)
54800 +{
54801 +       int err = 0;
54802 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54803 +
54804 +       INIT_LIST_HEAD(&dev_data->config_fields);
54805 +
54806 +       err = pciback_config_header_add_fields(dev);
54807 +
54808 +       return err;
54809 +}
54810 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/conf_space.h tmp-linux-2.6-xen.patch/drivers/xen/pciback/conf_space.h
54811 --- ref-linux-2.6.16.9/drivers/xen/pciback/conf_space.h 1970-01-01 01:00:00.000000000 +0100
54812 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/conf_space.h    2006-04-10 00:05:52.000000000 +0200
54813 @@ -0,0 +1,97 @@
54814 +/*
54815 + * PCI Backend - Common data structures for overriding the configuration space
54816 + *
54817 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
54818 + */
54819 +
54820 +#ifndef __XEN_PCIBACK_CONF_SPACE_H__
54821 +#define __XEN_PCIBACK_CONF_SPACE_H__
54822 +
54823 +#include <linux/list.h>
54824 +
54825 +typedef void *(*conf_field_init) (struct pci_dev * dev, int offset);
54826 +typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data);
54827 +typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data);
54828 +
54829 +typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value,
54830 +                                void *data);
54831 +typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value,
54832 +                               void *data);
54833 +typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value,
54834 +                               void *data);
54835 +typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value,
54836 +                               void *data);
54837 +typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value,
54838 +                              void *data);
54839 +typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value,
54840 +                              void *data);
54841 +
54842 +/* These are the fields within the configuration space which we
54843 + * are interested in intercepting reads/writes to and changing their
54844 + * values.
54845 + */
54846 +struct config_field {
54847 +       unsigned int     offset;
54848 +       unsigned int     size;
54849 +       conf_field_init  init;
54850 +       conf_field_reset reset;
54851 +       conf_field_free  release;
54852 +       union {
54853 +               struct {
54854 +                       conf_dword_write write;
54855 +                       conf_dword_read read;
54856 +               } dw;
54857 +               struct {
54858 +                       conf_word_write write;
54859 +                       conf_word_read read;
54860 +               } w;
54861 +               struct {
54862 +                       conf_byte_write write;
54863 +                       conf_byte_read read;
54864 +               } b;
54865 +       } u;
54866 +};
54867 +
54868 +struct config_field_entry {
54869 +       struct list_head list;
54870 +       struct config_field *field;
54871 +       void *data;
54872 +};
54873 +
54874 +/* Add fields to a device - pciback_config_add_fields() expects a pointer to
54875 + * the first entry in an array whose end is marked by an entry with size==0.
54876 + */
54877 +int pciback_config_add_field(struct pci_dev *dev, struct config_field *field);
54878 +static inline int pciback_config_add_fields(struct pci_dev *dev,
54879 +                                           struct config_field *field)
54880 +{
54881 +       int i, err = 0;
54882 +       for (i = 0; field[i].size != 0; i++) {
54883 +               err = pciback_config_add_field(dev, &field[i]);
54884 +               if (err)
54885 +                       break;
54886 +       }
54887 +       return err;
54888 +}
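
Callers hand pciback_config_add_fields() an array terminated by an all-zero entry, relying on .size == 0 as the sentinel; the header_common and header_0 tables later in this patch are real instances. A hypothetical minimal table, in the same initializer style:

	static struct config_field example_fields[] = {
		{
		 .offset   = PCI_COMMAND,
		 .size     = 2,
		 .u.w.read = pciback_read_config_word,
		 },
		{
		 .size = 0, /* terminator */
		 },
	};

	/* err = pciback_config_add_fields(dev, example_fields); */
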
54889 +
54890 +/* Initializers which add fields to the virtual configuration space
54891 + * ** We could add initializers to allow a guest domain to touch
54892 + * the capability lists (for power management, the AGP bridge, etc.)
54893 + */
54894 +int pciback_config_header_add_fields(struct pci_dev *dev);
54895 +
54896 +/* Read/Write the real configuration space */
54897 +int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value,
54898 +                            void *data);
54899 +int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value,
54900 +                            void *data);
54901 +int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value,
54902 +                             void *data);
54903 +int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
54904 +                             void *data);
54905 +int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
54906 +                             void *data);
54907 +int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
54908 +                              void *data);
54909 +
54910 +#endif                         /* __XEN_PCIBACK_CONF_SPACE_H__ */
54911 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/conf_space_header.c tmp-linux-2.6-xen.patch/drivers/xen/pciback/conf_space_header.c
54912 --- ref-linux-2.6.16.9/drivers/xen/pciback/conf_space_header.c  1970-01-01 01:00:00.000000000 +0100
54913 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/conf_space_header.c     2006-04-10 00:05:52.000000000 +0200
54914 @@ -0,0 +1,267 @@
54915 +/*
54916 + * PCI Backend - Handles the virtual fields in the configuration space headers.
54917 + *
54918 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
54919 + */
54920 +
54921 +#include <linux/kernel.h>
54922 +#include <linux/pci.h>
54923 +#include "pciback.h"
54924 +#include "conf_space.h"
54925 +
54926 +struct pci_bar_info {
54927 +       u32 val;
54928 +       u32 len_val;
54929 +       int which;
54930 +};
54931 +
54932 +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
54933 +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
54934 +
54935 +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
54936 +{
54937 +       if (!dev->is_enabled && is_enable_cmd(value)) {
54938 +               if (unlikely(verbose_request))
54939 +                       printk(KERN_DEBUG "pciback: %s: enable\n",
54940 +                              pci_name(dev));
54941 +               pci_enable_device(dev);
54942 +       } else if (dev->is_enabled && !is_enable_cmd(value)) {
54943 +               if (unlikely(verbose_request))
54944 +                       printk(KERN_DEBUG "pciback: %s: disable\n",
54945 +                              pci_name(dev));
54946 +               pci_disable_device(dev);
54947 +       }
54948 +
54949 +       if (!dev->is_busmaster && is_master_cmd(value)) {
54950 +               if (unlikely(verbose_request))
54951 +                       printk(KERN_DEBUG "pciback: %s: set bus master\n",
54952 +                              pci_name(dev));
54953 +               pci_set_master(dev);
54954 +       }
54955 +
54956 +       if (value & PCI_COMMAND_INVALIDATE) {
54957 +               if (unlikely(verbose_request))
54958 +                       printk(KERN_DEBUG
54959 +                              "pciback: %s: enable memory-write-invalidate\n",
54960 +                              pci_name(dev));
54961 +               pci_set_mwi(dev);
54962 +       }
54963 +
54964 +       return pci_write_config_word(dev, offset, value);
54965 +}
54966 +
54967 +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
54968 +{
54969 +       struct pci_bar_info *bar = data;
54970 +
54971 +       if (unlikely(!bar)) {
54972 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
54973 +                      pci_name(dev));
54974 +               return XEN_PCI_ERR_op_failed;
54975 +       }
54976 +
54977 +       /* A write to obtain the length must happen as a 32-bit write.
54978 +        * This does not (yet) support writing individual bytes.
54979 +        */
54980 +       if (value == ~PCI_ROM_ADDRESS_ENABLE)
54981 +               bar->which = 1;
54982 +       else
54983 +               bar->which = 0;
54984 +
54985 +       /* Do we need to support enabling/disabling the rom address here? */
54986 +
54987 +       return 0;
54988 +}
54989 +
54990 +/* For the BARs, only allow writes which write ~0 or
54991 + * the correct resource information
54992 + * (Needed for when the driver probes the resource usage)
54993 + */
54994 +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
54995 +{
54996 +       struct pci_bar_info *bar = data;
54997 +
54998 +       if (unlikely(!bar)) {
54999 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
55000 +                      pci_name(dev));
55001 +               return XEN_PCI_ERR_op_failed;
55002 +       }
55003 +
55004 +       /* A write to obtain the length must happen as a 32-bit write.
55005 +        * This does not (yet) support writing individual bytes.
55006 +        */
55007 +       if (value == ~0)
55008 +               bar->which = 1;
55009 +       else
55010 +               bar->which = 0;
55011 +
55012 +       return 0;
55013 +}
55014 +
55015 +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
55016 +{
55017 +       struct pci_bar_info *bar = data;
55018 +
55019 +       if (unlikely(!bar)) {
55020 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
55021 +                      pci_name(dev));
55022 +               return XEN_PCI_ERR_op_failed;
55023 +       }
55024 +
55025 +       *value = bar->which ? bar->len_val : bar->val;
55026 +
55027 +       return 0;
55028 +}
55029 +
55030 +static inline void read_dev_bar(struct pci_dev *dev,
55031 +                               struct pci_bar_info *bar_info, int offset,
55032 +                               u32 len_mask)
55033 +{
55034 +       pci_read_config_dword(dev, offset, &bar_info->val);
55035 +       pci_write_config_dword(dev, offset, len_mask);
55036 +       pci_read_config_dword(dev, offset, &bar_info->len_val);
55037 +       pci_write_config_dword(dev, offset, bar_info->val);
55038 +}
55039 +
55040 +static void *bar_init(struct pci_dev *dev, int offset)
55041 +{
55042 +       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
55043 +
55044 +       if (!bar)
55045 +               return ERR_PTR(-ENOMEM);
55046 +
55047 +       read_dev_bar(dev, bar, offset, ~0);
55048 +       bar->which = 0;
55049 +
55050 +       return bar;
55051 +}
55052 +
55053 +static void *rom_init(struct pci_dev *dev, int offset)
55054 +{
55055 +       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
55056 +
55057 +       if (!bar)
55058 +               return ERR_PTR(-ENOMEM);
55059 +
55060 +       read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
55061 +       bar->which = 0;
55062 +
55063 +       return bar;
55064 +}
55065 +
55066 +static void bar_reset(struct pci_dev *dev, int offset, void *data)
55067 +{
55068 +       struct pci_bar_info *bar = data;
55069 +
55070 +       bar->which = 0;
55071 +}
55072 +
55073 +static void bar_release(struct pci_dev *dev, int offset, void *data)
55074 +{
55075 +       kfree(data);
55076 +}
55077 +
55078 +static int interrupt_read(struct pci_dev *dev, int offset, u8 *value,
55079 +                         void *data)
55080 +{
55081 +       *value = (u8) dev->irq;
55082 +
55083 +       return 0;
55084 +}
55085 +
55086 +struct config_field header_common[] = {
55087 +       {
55088 +        .offset    = PCI_COMMAND,
55089 +        .size      = 2,
55090 +        .u.w.read  = pciback_read_config_word,
55091 +        .u.w.write = command_write,
55092 +        },
55093 +       {
55094 +        .offset    = PCI_INTERRUPT_LINE,
55095 +        .size      = 1,
55096 +        .u.b.read  = interrupt_read,
55097 +        .u.b.write = NULL,
55098 +        },
55099 +       {
55100 +        /* Any side effects of letting the driver domain control the cache line size? */
55101 +        .offset    = PCI_CACHE_LINE_SIZE,
55102 +        .size      = 1,
55103 +        .u.b.read  = pciback_read_config_byte,
55104 +        .u.b.write = pciback_write_config_byte,
55105 +        },
55106 +       {
55107 +        .size = 0,
55108 +        },
55109 +};
55110 +
55111 +#define CFG_FIELD_BAR(reg_offset)                      \
55112 +       {                                               \
55113 +        .offset     = reg_offset,                      \
55114 +        .size       = 4,                               \
55115 +        .init       = bar_init,                        \
55116 +        .reset      = bar_reset,                       \
55117 +        .release    = bar_release,                     \
55118 +        .u.dw.read  = bar_read,                        \
55119 +        .u.dw.write = bar_write,                       \
55120 +        }
55121 +
55122 +#define CFG_FIELD_ROM(reg_offset)                      \
55123 +       {                                               \
55124 +        .offset     = reg_offset,                      \
55125 +        .size       = 4,                               \
55126 +        .init       = rom_init,                        \
55127 +        .reset      = bar_reset,                       \
55128 +        .release    = bar_release,                     \
55129 +        .u.dw.read  = bar_read,                        \
55130 +        .u.dw.write = rom_write,                       \
55131 +        }
55132 +
55133 +struct config_field header_0[] = {
55134 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
55135 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
55136 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
55137 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
55138 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
55139 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
55140 +       CFG_FIELD_ROM(PCI_ROM_ADDRESS),
55141 +       {
55142 +        .size = 0,
55143 +        },
55144 +};
55145 +
55146 +struct config_field header_1[] = {
55147 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
55148 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
55149 +       CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
55150 +       {
55151 +        .size = 0,
55152 +        },
55153 +};
55154 +
55155 +int pciback_config_header_add_fields(struct pci_dev *dev)
55156 +{
55157 +       int err;
55158 +
55159 +       err = pciback_config_add_fields(dev, header_common);
55160 +       if (err)
55161 +               goto out;
55162 +
55163 +       switch (dev->hdr_type) {
55164 +       case PCI_HEADER_TYPE_NORMAL:
55165 +               err = pciback_config_add_fields(dev, header_0);
55166 +               break;
55167 +
55168 +       case PCI_HEADER_TYPE_BRIDGE:
55169 +               err = pciback_config_add_fields(dev, header_1);
55170 +               break;
55171 +
55172 +       default:
55173 +               err = -EINVAL;
55174 +               printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
55175 +                      pci_name(dev), dev->hdr_type);
55176 +               break;
55177 +       }
55178 +
55179 +      out:
55180 +       return err;
55181 +}
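
The handlers above never touch real hardware on the guest's behalf: read_dev_bar() caches both the live BAR value and its size mask when the device is seized, and bar_write()/bar_read() then replay the standard PCI sizing handshake against that cache. A minimal sketch of how a size probe resolves through these handlers (probe_bar_size() is a hypothetical helper for illustration only, assuming a 32-bit memory BAR):

	/* Illustration only - not part of the patch. */
	static u32 probe_bar_size(struct pci_dev *dev, int offset, void *bar_data)
	{
		u32 mask;

		bar_write(dev, offset, ~0, bar_data);    /* arms bar->which = 1    */
		bar_read(dev, offset, &mask, bar_data);  /* returns bar->len_val   */
		bar_write(dev, offset, 0, bar_data);     /* back to bar->which = 0 */

		/* standard decode: mask off the flag bits, then size = ~mask + 1 */
		return ~(mask & PCI_BASE_ADDRESS_MEM_MASK) + 1;
	}
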
55182 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/Makefile tmp-linux-2.6-xen.patch/drivers/xen/pciback/Makefile
55183 --- ref-linux-2.6.16.9/drivers/xen/pciback/Makefile     1970-01-01 01:00:00.000000000 +0100
55184 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/Makefile        2006-04-10 00:05:52.000000000 +0200
55185 @@ -0,0 +1,10 @@
55186 +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o
55187 +
55188 +pciback-y := pci_stub.o pciback_ops.o xenbus.o
55189 +pciback-y += conf_space.o conf_space_header.o
55190 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
55191 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
55192 +
55193 +ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
55194 +EXTRA_CFLAGS += -DDEBUG
55195 +endif
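
The Makefile composes pciback from a fixed core plus at most the two interchangeable device models (vpci.o and passthrough.o), selected at configure time. As a sketch of the build selection, assuming the corresponding Kconfig entries elsewhere in this patch expose these symbols, a .config fragment for the virtual-PCI model with debug output would read:

	CONFIG_XEN_PCIDEV_BACKEND=y
	CONFIG_XEN_PCIDEV_BACKEND_VPCI=y
	# CONFIG_XEN_PCIDEV_BACKEND_PASS is not set
	CONFIG_XEN_PCIDEV_BE_DEBUG=y
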
55196 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/passthrough.c tmp-linux-2.6-xen.patch/drivers/xen/pciback/passthrough.c
55197 --- ref-linux-2.6.16.9/drivers/xen/pciback/passthrough.c        1970-01-01 01:00:00.000000000 +0100
55198 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/passthrough.c   2006-04-10 00:05:52.000000000 +0200
55199 @@ -0,0 +1,157 @@
55200 +/*
55201 + * PCI Backend - Provides restricted access to the real PCI bus topology
55202 + *               to the frontend
55203 + *
55204 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
55205 + */
55206 +
55207 +#include <linux/list.h>
55208 +#include <linux/pci.h>
55209 +#include <linux/spinlock.h>
55210 +#include "pciback.h"
55211 +
55212 +struct passthrough_dev_data {
55213 +       /* Access to dev_list must be protected by lock */
55214 +       struct list_head dev_list;
55215 +       spinlock_t lock;
55216 +};
55217 +
55218 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
55219 +                                   unsigned int domain, unsigned int bus,
55220 +                                   unsigned int devfn)
55221 +{
55222 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55223 +       struct pci_dev_entry *dev_entry;
55224 +       struct pci_dev *dev = NULL;
55225 +       unsigned long flags;
55226 +
55227 +       spin_lock_irqsave(&dev_data->lock, flags);
55228 +
55229 +       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
55230 +               if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
55231 +                   && bus == (unsigned int)dev_entry->dev->bus->number
55232 +                   && devfn == dev_entry->dev->devfn) {
55233 +                       dev = dev_entry->dev;
55234 +                       break;
55235 +               }
55236 +       }
55237 +
55238 +       spin_unlock_irqrestore(&dev_data->lock, flags);
55239 +
55240 +       return dev;
55241 +}
55242 +
55243 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
55244 +{
55245 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55246 +       struct pci_dev_entry *dev_entry;
55247 +       unsigned long flags;
55248 +
55249 +       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
55250 +       if (!dev_entry)
55251 +               return -ENOMEM;
55252 +       dev_entry->dev = dev;
55253 +
55254 +       spin_lock_irqsave(&dev_data->lock, flags);
55255 +       list_add_tail(&dev_entry->list, &dev_data->dev_list);
55256 +       spin_unlock_irqrestore(&dev_data->lock, flags);
55257 +
55258 +       return 0;
55259 +}
55260 +
55261 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
55262 +{
55263 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55264 +       struct pci_dev_entry *dev_entry, *t;
55265 +       struct pci_dev *found_dev = NULL;
55266 +       unsigned long flags;
55267 +
55268 +       spin_lock_irqsave(&dev_data->lock, flags);
55269 +
55270 +       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
55271 +               if (dev_entry->dev == dev) {
55272 +                       list_del(&dev_entry->list);
55273 +                       found_dev = dev_entry->dev;
55274 +                       kfree(dev_entry);
55275 +               }
55276 +       }
55277 +
55278 +       spin_unlock_irqrestore(&dev_data->lock, flags);
55279 +
55280 +       if (found_dev)
55281 +               pcistub_put_pci_dev(found_dev);
55282 +}
55283 +
55284 +int pciback_init_devices(struct pciback_device *pdev)
55285 +{
55286 +       struct passthrough_dev_data *dev_data;
55287 +
55288 +       dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
55289 +       if (!dev_data)
55290 +               return -ENOMEM;
55291 +
55292 +       spin_lock_init(&dev_data->lock);
55293 +
55294 +       INIT_LIST_HEAD(&dev_data->dev_list);
55295 +
55296 +       pdev->pci_dev_data = dev_data;
55297 +
55298 +       return 0;
55299 +}
55300 +
55301 +int pciback_publish_pci_roots(struct pciback_device *pdev,
55302 +                             publish_pci_root_cb publish_root_cb)
55303 +{
55304 +       int err = 0;
55305 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55306 +       struct pci_dev_entry *dev_entry, *e;
55307 +       struct pci_dev *dev;
55308 +       int found;
55309 +       unsigned int domain, bus;
55310 +
55311 +       spin_lock(&dev_data->lock);
55312 +
55313 +       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
55314 +               /* Only publish this device as a root if none of its
55315 +                * parent bridges are exported
55316 +                */
55317 +               found = 0;
55318 +               dev = dev_entry->dev->bus->self;
55319 +               for (; !found && dev != NULL; dev = dev->bus->self) {
55320 +                       list_for_each_entry(e, &dev_data->dev_list, list) {
55321 +                               if (dev == e->dev) {
55322 +                                       found = 1;
55323 +                                       break;
55324 +                               }
55325 +                       }
55326 +               }
55327 +
55328 +               domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
55329 +               bus = (unsigned int)dev_entry->dev->bus->number;
55330 +
55331 +               if (!found) {
55332 +                       err = publish_root_cb(pdev, domain, bus);
55333 +                       if (err)
55334 +                               break;
55335 +               }
55336 +       }
55337 +
55338 +       spin_unlock(&dev_data->lock);
55339 +
55340 +       return err;
55341 +}
55342 +
55343 +void pciback_release_devices(struct pciback_device *pdev)
55344 +{
55345 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55346 +       struct pci_dev_entry *dev_entry, *t;
55347 +
55348 +       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
55349 +               list_del(&dev_entry->list);
55350 +               pcistub_put_pci_dev(dev_entry->dev);
55351 +               kfree(dev_entry);
55352 +       }
55353 +
55354 +       kfree(dev_data);
55355 +       pdev->pci_dev_data = NULL;
55356 +}
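
Note that pciback_publish_pci_roots() above advertises a device's bus as a root only when no ancestor bridge is itself exported, so a frontend that is handed a bridge implicitly covers the devices behind it. The ancestor walk reads more easily factored out (a hypothetical helper equivalent to the inline loop; the caller would hold dev_data->lock):

	static int has_exported_ancestor(struct passthrough_dev_data *dev_data,
					 struct pci_dev *dev)
	{
		struct pci_dev_entry *e;

		/* dev->bus->self is NULL once we reach a root bus */
		for (dev = dev->bus->self; dev != NULL; dev = dev->bus->self)
			list_for_each_entry(e, &dev_data->dev_list, list)
				if (e->dev == dev)
					return 1;
		return 0;
	}
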
55357 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/pciback.h tmp-linux-2.6-xen.patch/drivers/xen/pciback/pciback.h
55358 --- ref-linux-2.6.16.9/drivers/xen/pciback/pciback.h    1970-01-01 01:00:00.000000000 +0100
55359 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/pciback.h       2006-04-10 00:05:52.000000000 +0200
55360 @@ -0,0 +1,78 @@
55361 +/*
55362 + * PCI Backend Common Data Structures & Function Declarations
55363 + *
55364 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
55365 + */
55366 +#ifndef __XEN_PCIBACK_H__
55367 +#define __XEN_PCIBACK_H__
55368 +
55369 +#include <linux/pci.h>
55370 +#include <linux/interrupt.h>
55371 +#include <xen/xenbus.h>
55372 +#include <linux/list.h>
55373 +#include <linux/spinlock.h>
55374 +#include <xen/interface/io/pciif.h>
55375 +
55376 +struct pci_dev_entry {
55377 +       struct list_head list;
55378 +       struct pci_dev *dev;
55379 +};
55380 +
55381 +struct pciback_device {
55382 +       void *pci_dev_data;
55383 +       spinlock_t dev_lock;
55384 +
55385 +       struct xenbus_device *xdev;
55386 +
55387 +       struct xenbus_watch be_watch;
55388 +       u8 be_watching;
55389 +
55390 +       int evtchn_irq;
55391 +
55392 +       struct xen_pci_sharedinfo *sh_info;
55393 +};
55394 +
55395 +struct pciback_dev_data {
55396 +       struct list_head config_fields;
55397 +};
55398 +
55399 +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
55400 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
55401 +                                           int domain, int bus,
55402 +                                           int slot, int func);
55403 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
55404 +                                   struct pci_dev *dev);
55405 +void pcistub_put_pci_dev(struct pci_dev *dev);
55406 +
55407 +/* Ensure a device is turned off or reset */
55408 +void pciback_reset_device(struct pci_dev *pdev);
55409 +
55410 +/* Access a virtual configuration space for a PCI device */
55411 +int pciback_config_init(struct pci_dev *dev);
55412 +void pciback_config_reset(struct pci_dev *dev);
55413 +void pciback_config_free(struct pci_dev *dev);
55414 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
55415 +                       u32 *ret_val);
55416 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
55417 +
55418 +/* Handle requests for specific devices from the frontend */
55419 +typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
55420 +                                   unsigned int domain, unsigned int bus);
55421 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
55422 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
55423 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
55424 +                                   unsigned int domain, unsigned int bus,
55425 +                                   unsigned int devfn);
55426 +int pciback_init_devices(struct pciback_device *pdev);
55427 +int pciback_publish_pci_roots(struct pciback_device *pdev,
55428 +                             publish_pci_root_cb cb);
55429 +void pciback_release_devices(struct pciback_device *pdev);
55430 +
55431 +/* Handles events from the frontend */
55432 +irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
55433 +
55434 +int pciback_xenbus_register(void);
55435 +void pciback_xenbus_unregister(void);
55436 +
55437 +extern int verbose_request;
55438 +#endif
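
Everything from pciback_add_pci_dev() through pciback_release_devices() is the device-model interface: exactly one of vpci.c or passthrough.c supplies these symbols, as chosen in the Makefile. The publish_pci_root_cb contract is small; a trivial callback might look like this (a sketch for illustration only; the real callback in xenbus.c records each root in xenstore):

	static int log_pci_root(struct pciback_device *pdev,
				unsigned int domain, unsigned int bus)
	{
		dev_info(&pdev->xdev->dev, "pci root %04x:%02x\n", domain, bus);
		return 0;	/* non-zero aborts pciback_publish_pci_roots() */
	}
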
55439 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/pciback_ops.c tmp-linux-2.6-xen.patch/drivers/xen/pciback/pciback_ops.c
55440 --- ref-linux-2.6.16.9/drivers/xen/pciback/pciback_ops.c        1970-01-01 01:00:00.000000000 +0100
55441 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/pciback_ops.c   2006-04-10 00:05:52.000000000 +0200
55442 @@ -0,0 +1,74 @@
55443 +/*
55444 + * PCI Backend Operations - respond to PCI requests from Frontend
55445 + *
55446 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
55447 + */
55448 +#include <linux/module.h>
55449 +#include <asm/bitops.h>
55450 +#include <xen/evtchn.h>
55451 +#include "pciback.h"
55452 +
55453 +int verbose_request = 0;
55454 +module_param(verbose_request, int, 0644);
55455 +
55456 +/* Ensure a device is "turned off" and ready to be exported.
55457 + * (Also see pciback_config_reset to ensure virtual configuration space is
55458 + * ready to be re-exported)
55459 + */
55460 +void pciback_reset_device(struct pci_dev *dev)
55461 +{
55462 +       u16 cmd;
55463 +
55464 +       /* Disable devices (but not bridges) */
55465 +       if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
55466 +               pci_disable_device(dev);
55467 +
55468 +               pci_write_config_word(dev, PCI_COMMAND, 0);
55469 +
55470 +               dev->is_enabled = 0;
55471 +               dev->is_busmaster = 0;
55472 +       } else {
55473 +               pci_read_config_word(dev, PCI_COMMAND, &cmd);
55474 +               if (cmd & (PCI_COMMAND_INVALIDATE)) {
55475 +                       cmd &= ~(PCI_COMMAND_INVALIDATE);
55476 +                       pci_write_config_word(dev, PCI_COMMAND, cmd);
55477 +
55478 +                       dev->is_busmaster = 0;
55479 +               }
55480 +       }
55481 +
55482 +       pciback_config_reset(dev);
55483 +}
55484 +
55485 +irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
55486 +{
55487 +       struct pciback_device *pdev = dev_id;
55488 +       struct pci_dev *dev;
55489 +       struct xen_pci_op *op = &pdev->sh_info->op;
55490 +
55491 +       if (unlikely(!test_bit(_XEN_PCIF_active,
55492 +                              (unsigned long *)&pdev->sh_info->flags))) {
55493 +               pr_debug("pciback: interrupt, but no active operation\n");
55494 +               goto out;
55495 +       }
55496 +
55497 +       dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
55498 +
55499 +       if (dev == NULL)
55500 +               op->err = XEN_PCI_ERR_dev_not_found;
55501 +       else if (op->cmd == XEN_PCI_OP_conf_read)
55502 +               op->err = pciback_config_read(dev, op->offset, op->size,
55503 +                                             &op->value);
55504 +       else if (op->cmd == XEN_PCI_OP_conf_write)
55505 +               op->err = pciback_config_write(dev, op->offset, op->size,
55506 +                                              op->value);
55507 +       else
55508 +               op->err = XEN_PCI_ERR_not_implemented;
55509 +
55510 +       wmb();
55511 +       clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
55512 +       notify_remote_via_irq(pdev->evtchn_irq);
55513 +
55514 +      out:
55515 +       return IRQ_HANDLED;
55516 +}
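
pciback_handle_event() serves one request at a time from the shared page: the frontend fills in sh_info->op, sets _XEN_PCIF_active and kicks the event channel; the backend writes the result fields, issues wmb() so they are visible before the flag is cleared, then notifies the frontend. The shape of one request as the backend sees it (field names from xen/interface/io/pciif.h; the concrete values below are an assumed example):

	struct xen_pci_op example_op = {
		.cmd    = XEN_PCI_OP_conf_read, /* routed to pciback_config_read() */
		.domain = 0,
		.bus    = 0,
		.devfn  = PCI_DEVFN(3, 0),
		.offset = PCI_VENDOR_ID,        /* config-space offset 0x00 */
		.size   = 2,                    /* 16-bit read */
	};
	/* on completion the backend fills in example_op.err and example_op.value */
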
55517 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/pci_stub.c tmp-linux-2.6-xen.patch/drivers/xen/pciback/pci_stub.c
55518 --- ref-linux-2.6.16.9/drivers/xen/pciback/pci_stub.c   1970-01-01 01:00:00.000000000 +0100
55519 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/pci_stub.c      2006-04-10 00:05:52.000000000 +0200
55520 @@ -0,0 +1,695 @@
55521 +/*
55522 + * PCI Stub Driver - Grabs devices in backend to be exported later
55523 + *
55524 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
55525 + */
55526 +#include <linux/module.h>
55527 +#include <linux/init.h>
55528 +#include <linux/list.h>
55529 +#include <linux/spinlock.h>
55530 +#include <linux/kref.h>
55531 +#include <asm/atomic.h>
55532 +#include "pciback.h"
55533 +
55534 +static char *pci_devs_to_hide = NULL;
55535 +module_param_named(hide, pci_devs_to_hide, charp, 0444);
55536 +
55537 +struct pcistub_device_id {
55538 +       struct list_head slot_list;
55539 +       int domain;
55540 +       unsigned char bus;
55541 +       unsigned int devfn;
55542 +};
55543 +static LIST_HEAD(pcistub_device_ids);
55544 +static DEFINE_SPINLOCK(device_ids_lock);
55545 +
55546 +struct pcistub_device {
55547 +       struct kref kref;
55548 +       struct list_head dev_list;
55549 +       spinlock_t lock;
55550 +
55551 +       struct pci_dev *dev;
55552 +       struct pciback_device *pdev;    /* non-NULL if struct pci_dev is in use */
55553 +};
55554 +/* Access to the pcistub_devices & seized_devices lists and the
55555 + * initialize_devices flag must be protected by pcistub_devices_lock
55556 + */
55557 +static DEFINE_SPINLOCK(pcistub_devices_lock);
55558 +static LIST_HEAD(pcistub_devices);
55559 +
55560 +/* wait for device_initcall before initializing our devices
55561 + * (see pcistub_init_devices_late)
55562 + */
55563 +static int initialize_devices = 0;
55564 +static LIST_HEAD(seized_devices);
55565 +
55566 +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
55567 +{
55568 +       struct pcistub_device *psdev;
55569 +
55570 +       dev_dbg(&dev->dev, "pcistub_device_alloc\n");
55571 +
55572 +       psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
55573 +       if (!psdev)
55574 +               return NULL;
55575 +
55576 +       psdev->dev = pci_dev_get(dev);
55577 +       if (!psdev->dev) {
55578 +               kfree(psdev);
55579 +               return NULL;
55580 +       }
55581 +
55582 +       kref_init(&psdev->kref);
55583 +       spin_lock_init(&psdev->lock);
55584 +
55585 +       return psdev;
55586 +}
55587 +
55588 +/* Don't call this directly; it is called via pcistub_device_put() */
55589 +static void pcistub_device_release(struct kref *kref)
55590 +{
55591 +       struct pcistub_device *psdev;
55592 +
55593 +       psdev = container_of(kref, struct pcistub_device, kref);
55594 +
55595 +       dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
55596 +
55597 +       /* Clean-up the device */
55598 +       pciback_reset_device(psdev->dev);
55599 +       pciback_config_free(psdev->dev);
55600 +       kfree(pci_get_drvdata(psdev->dev));
55601 +       pci_set_drvdata(psdev->dev, NULL);
55602 +
55603 +       pci_dev_put(psdev->dev);
55604 +
55605 +       kfree(psdev);
55606 +}
55607 +
55608 +static inline void pcistub_device_get(struct pcistub_device *psdev)
55609 +{
55610 +       kref_get(&psdev->kref);
55611 +}
55612 +
55613 +static inline void pcistub_device_put(struct pcistub_device *psdev)
55614 +{
55615 +       kref_put(&psdev->kref, pcistub_device_release);
55616 +}
55617 +
55618 +static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
55619 +                                                 struct pcistub_device *psdev)
55620 +{
55621 +       struct pci_dev *pci_dev = NULL;
55622 +       unsigned long flags;
55623 +
55624 +       pcistub_device_get(psdev);
55625 +
55626 +       spin_lock_irqsave(&psdev->lock, flags);
55627 +       if (!psdev->pdev) {
55628 +               psdev->pdev = pdev;
55629 +               pci_dev = psdev->dev;
55630 +       }
55631 +       spin_unlock_irqrestore(&psdev->lock, flags);
55632 +
55633 +       if (!pci_dev)
55634 +               pcistub_device_put(psdev);
55635 +
55636 +       return pci_dev;
55637 +}
55638 +
55639 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
55640 +                                           int domain, int bus,
55641 +                                           int slot, int func)
55642 +{
55643 +       struct pcistub_device *psdev;
55644 +       struct pci_dev *found_dev = NULL;
55645 +       unsigned long flags;
55646 +
55647 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55648 +
55649 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
55650 +               if (psdev->dev != NULL
55651 +                   && domain == pci_domain_nr(psdev->dev->bus)
55652 +                   && bus == psdev->dev->bus->number
55653 +                   && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
55654 +                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
55655 +                       break;
55656 +               }
55657 +       }
55658 +
55659 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55660 +       return found_dev;
55661 +}
55662 +
55663 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
55664 +                                   struct pci_dev *dev)
55665 +{
55666 +       struct pcistub_device *psdev;
55667 +       struct pci_dev *found_dev = NULL;
55668 +       unsigned long flags;
55669 +
55670 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55671 +
55672 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
55673 +               if (psdev->dev == dev) {
55674 +                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
55675 +                       break;
55676 +               }
55677 +       }
55678 +
55679 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55680 +       return found_dev;
55681 +}
55682 +
55683 +void pcistub_put_pci_dev(struct pci_dev *dev)
55684 +{
55685 +       struct pcistub_device *psdev, *found_psdev = NULL;
55686 +       unsigned long flags;
55687 +
55688 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55689 +
55690 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
55691 +               if (psdev->dev == dev) {
55692 +                       found_psdev = psdev;
55693 +                       break;
55694 +               }
55695 +       }
55696 +
55697 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55698 +
55699 +       /* Clean up our device
55700 +        * (so it's ready for the next domain)
55701 +        */
55702 +       pciback_reset_device(found_psdev->dev);
55703 +       pciback_config_reset(found_psdev->dev);
55704 +
55705 +       spin_lock_irqsave(&found_psdev->lock, flags);
55706 +       found_psdev->pdev = NULL;
55707 +       spin_unlock_irqrestore(&found_psdev->lock, flags);
55708 +
55709 +       pcistub_device_put(found_psdev);
55710 +}
55711 +
55712 +static int __devinit pcistub_match_one(struct pci_dev *dev,
55713 +                                      struct pcistub_device_id *pdev_id)
55714 +{
55715 +       /* Match the specified device by domain, bus, slot and func; also
55716 +        * match if any of the device's parent bridges match.
55717 +        */
55718 +       for (; dev != NULL; dev = dev->bus->self) {
55719 +               if (pci_domain_nr(dev->bus) == pdev_id->domain
55720 +                   && dev->bus->number == pdev_id->bus
55721 +                   && dev->devfn == pdev_id->devfn)
55722 +                       return 1;
55723 +       }
55724 +
55725 +       return 0;
55726 +}
55727 +
55728 +static int __devinit pcistub_match(struct pci_dev *dev)
55729 +{
55730 +       struct pcistub_device_id *pdev_id;
55731 +       unsigned long flags;
55732 +       int found = 0;
55733 +
55734 +       spin_lock_irqsave(&device_ids_lock, flags);
55735 +       list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
55736 +               if (pcistub_match_one(dev, pdev_id)) {
55737 +                       found = 1;
55738 +                       break;
55739 +               }
55740 +       }
55741 +       spin_unlock_irqrestore(&device_ids_lock, flags);
55742 +
55743 +       return found;
55744 +}
55745 +
55746 +static int __devinit pcistub_init_device(struct pci_dev *dev)
55747 +{
55748 +       struct pciback_dev_data *dev_data;
55749 +       int err = 0;
55750 +
55751 +       dev_dbg(&dev->dev, "initializing...\n");
55752 +
55753 +       /* The PCI backend is not (yet) intended to be a module, or to work
55754 +        * with removable PCI devices. If it were, pciback_config_free()
55755 +        * would need to be called somewhere to free the memory allocated
55756 +        * here, followed by kfree(pci_get_drvdata(psdev->dev)).
55757 +        */
55758 +       dev_data = kmalloc(sizeof(*dev_data), GFP_ATOMIC);
55759 +       if (!dev_data) {
55760 +               err = -ENOMEM;
55761 +               goto out;
55762 +       }
55763 +       pci_set_drvdata(dev, dev_data);
55764 +
55765 +       dev_dbg(&dev->dev, "initializing config\n");
55766 +       err = pciback_config_init(dev);
55767 +       if (err)
55768 +               goto out;
55769 +
55770 +       /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
55771 +        * must do this here because pcibios_enable_device may specify
55772 +        * the PCI device's true IRQ (and possibly its other resources)
55773 +        * if they differ from what's in the configuration space.
55774 +        * This makes the assumption that the device's resources won't
55775 +        * change after this point (otherwise this code may break!)
55776 +        */
55777 +       dev_dbg(&dev->dev, "enabling device\n");
55778 +       err = pci_enable_device(dev);
55779 +       if (err)
55780 +               goto config_release;
55781 +
55782 +       /* Now disable the device (this also ensures some private device
55783 +        * data is set up before we export)
55784 +        */
55785 +       dev_dbg(&dev->dev, "reset device\n");
55786 +       pciback_reset_device(dev);
55787 +
55788 +       return 0;
55789 +
55790 +      config_release:
55791 +       pciback_config_free(dev);
55792 +
55793 +      out:
55794 +       pci_set_drvdata(dev, NULL);
55795 +       kfree(dev_data);
55796 +       return err;
55797 +}
55798 +
55799 +/*
55800 + * Because some initialization still happens on
55801 + * devices during fs_initcall, we need to defer
55802 + * full initialization of our devices until
55803 + * device_initcall.
55804 + */
55805 +static int __init pcistub_init_devices_late(void)
55806 +{
55807 +       struct pcistub_device *psdev;
55808 +       unsigned long flags;
55809 +       int err = 0;
55810 +
55811 +       pr_debug("pciback: pcistub_init_devices_late\n");
55812 +
55813 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55814 +
55815 +       while (!list_empty(&seized_devices)) {
55816 +               psdev = container_of(seized_devices.next,
55817 +                                    struct pcistub_device, dev_list);
55818 +               list_del(&psdev->dev_list);
55819 +
55820 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55821 +
55822 +               err = pcistub_init_device(psdev->dev);
55823 +               if (err) {
55824 +                       dev_err(&psdev->dev->dev,
55825 +                               "error %d initializing device\n", err);
55826 +                       kfree(psdev);
55827 +                       psdev = NULL;
55828 +               }
55829 +
55830 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
55831 +
55832 +               if (psdev)
55833 +                       list_add_tail(&psdev->dev_list, &pcistub_devices);
55834 +       }
55835 +
55836 +       initialize_devices = 1;
55837 +
55838 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55839 +
55840 +       return 0;
55841 +}
55842 +
55843 +static int __devinit pcistub_seize(struct pci_dev *dev)
55844 +{
55845 +       struct pcistub_device *psdev;
55846 +       unsigned long flags;
55847 +       int initialize_devices_copy;
55848 +       int err = 0;
55849 +
55850 +       psdev = pcistub_device_alloc(dev);
55851 +       if (!psdev)
55852 +               return -ENOMEM;
55853 +
55854 +       /* initialize_devices has to be accessed under a spin lock. But since
55855 +        * it can only change from 0 -> 1, if it's already 1, we don't have to
55856 +        * worry about it changing. That's why we can take a *copy* of
55857 +        * initialize_devices and wait till we're outside of the lock to
55858 +        * check if it's 1 (don't ever check if it's 0 outside of the lock)
55859 +        */
55860 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55861 +
55862 +       initialize_devices_copy = initialize_devices;
55863 +
55864 +       if (!initialize_devices_copy) {
55865 +               dev_dbg(&dev->dev, "deferring initialization\n");
55866 +               list_add(&psdev->dev_list, &seized_devices);
55867 +       }
55868 +
55869 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55870 +
55871 +       if (initialize_devices_copy) {
55872 +               /* don't want IRQs disabled when calling pcistub_init_device */
55873 +               err = pcistub_init_device(psdev->dev);
55874 +               if (err)
55875 +                       goto out;
55876 +
55877 +               list_add(&psdev->dev_list, &pcistub_devices);
55878 +       }
55879 +
55880 +      out:
55881 +       if (err)
55882 +               pcistub_device_put(psdev);
55883 +
55884 +       return err;
55885 +}
55886 +
55887 +static int __devinit pcistub_probe(struct pci_dev *dev,
55888 +                                  const struct pci_device_id *id)
55889 +{
55890 +       int err = 0;
55891 +
55892 +       dev_dbg(&dev->dev, "probing...\n");
55893 +
55894 +       if (pcistub_match(dev)) {
55895 +
55896 +               if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
55897 +                   && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
55898 +                       dev_err(&dev->dev, "can't export PCI devices that "
55899 +                               "don't have a normal (0) or bridge (1) "
55900 +                               "header type!\n");
55901 +                       err = -ENODEV;
55902 +                       goto out;
55903 +               }
55904 +
55905 +               dev_info(&dev->dev, "seizing device\n");
55906 +               err = pcistub_seize(dev);
55907 +       } else
55908 +               /* Didn't find the device */
55909 +               err = -ENODEV;
55910 +
55911 +      out:
55912 +       return err;
55913 +}
55914 +
55915 +static void pcistub_remove(struct pci_dev *dev)
55916 +{
55917 +       struct pcistub_device *psdev, *found_psdev = NULL;
55918 +       unsigned long flags;
55919 +
55920 +       dev_dbg(&dev->dev, "removing\n");
55921 +
55922 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55923 +
55924 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
55925 +               if (psdev->dev == dev) {
55926 +                       found_psdev = psdev;
55927 +                       break;
55928 +               }
55929 +       }
55930 +
55931 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55932 +
55933 +       if (found_psdev) {
55934 +               dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
55935 +                       found_psdev->pdev);
55936 +
55937 +               if (found_psdev->pdev) {
55938 +                       printk(KERN_WARNING "pciback: ****** removing device "
55939 +                              "%s while still in use! ******\n",
55940 +                              pci_name(found_psdev->dev));
55941 +                       printk(KERN_WARNING "pciback: ****** driver domain may "
55942 +                              "still access this device's I/O resources!\n");
55943 +                       printk(KERN_WARNING "pciback: ****** shut down driver "
55944 +                              "domain before binding device\n");
55945 +                       printk(KERN_WARNING "pciback: ****** to other drivers "
55946 +                              "or domains\n");
55947 +
55948 +                       pciback_release_pci_dev(found_psdev->pdev,
55949 +                                               found_psdev->dev);
55950 +               }
55951 +
55952 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
55953 +               list_del(&found_psdev->dev_list);
55954 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55955 +
55956 +               /* the final put for releasing from the list */
55957 +               pcistub_device_put(found_psdev);
55958 +       }
55959 +}
55960 +
55961 +static struct pci_device_id pcistub_ids[] = {
55962 +       {
55963 +        .vendor = PCI_ANY_ID,
55964 +        .device = PCI_ANY_ID,
55965 +        .subvendor = PCI_ANY_ID,
55966 +        .subdevice = PCI_ANY_ID,
55967 +        },
55968 +       {0,},
55969 +};
55970 +
55971 +/*
55972 + * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
55973 + * for a normal device. I don't want it to be loaded automatically.
55974 + */
55975 +
55976 +static struct pci_driver pciback_pci_driver = {
55977 +       .name = "pciback",
55978 +       .id_table = pcistub_ids,
55979 +       .probe = pcistub_probe,
55980 +       .remove = pcistub_remove,
55981 +};
55982 +
55983 +static inline int str_to_slot(const char *buf, int *domain, int *bus,
55984 +                             int *slot, int *func)
55985 +{
55986 +       int err;
55987 +
55988 +       err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
55989 +       if (err == 4)
55990 +               return 0;
55991 +       else if (err < 0)
55992 +               return -EINVAL;
55993 +
55994 +       /* try again without domain */
55995 +       *domain = 0;
55996 +       err = sscanf(buf, " %x:%x.%x", bus, slot, func);
55997 +       if (err == 3)
55998 +               return 0;
55999 +
56000 +       return -EINVAL;
56001 +}
56002 +
56003 +static int pcistub_device_id_add(int domain, int bus, int slot, int func)
56004 +{
56005 +       struct pcistub_device_id *pci_dev_id;
56006 +       unsigned long flags;
56007 +
56008 +       pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
56009 +       if (!pci_dev_id)
56010 +               return -ENOMEM;
56011 +
56012 +       pci_dev_id->domain = domain;
56013 +       pci_dev_id->bus = bus;
56014 +       pci_dev_id->devfn = PCI_DEVFN(slot, func);
56015 +
56016 +       pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
56017 +                domain, bus, slot, func);
56018 +
56019 +       spin_lock_irqsave(&device_ids_lock, flags);
56020 +       list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
56021 +       spin_unlock_irqrestore(&device_ids_lock, flags);
56022 +
56023 +       return 0;
56024 +}
56025 +
56026 +static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
56027 +{
56028 +       struct pcistub_device_id *pci_dev_id, *t;
56029 +       int devfn = PCI_DEVFN(slot, func);
56030 +       int err = -ENOENT;
56031 +       unsigned long flags;
56032 +
56033 +       spin_lock_irqsave(&device_ids_lock, flags);
56034 +       list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) {
56035 +
56036 +               if (pci_dev_id->domain == domain
56037 +                   && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
56038 +                       /* Don't break out of the loop here: the same
56039 +                        * slot could be in the list more than once
56040 +                        */
56041 +                       list_del(&pci_dev_id->slot_list);
56042 +                       kfree(pci_dev_id);
56043 +
56044 +                       err = 0;
56045 +
56046 +                       pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
56047 +                                "seize list\n", domain, bus, slot, func);
56048 +               }
56049 +       }
56050 +       spin_unlock_irqrestore(&device_ids_lock, flags);
56051 +
56052 +       return err;
56053 +}
56054 +
56055 +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
56056 +                               size_t count)
56057 +{
56058 +       int domain, bus, slot, func;
56059 +       int err;
56060 +
56061 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
56062 +       if (err)
56063 +               goto out;
56064 +
56065 +       err = pcistub_device_id_add(domain, bus, slot, func);
56066 +
56067 +      out:
56068 +       if (!err)
56069 +               err = count;
56070 +       return err;
56071 +}
56072 +
56073 +DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
56074 +
56075 +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
56076 +                                  size_t count)
56077 +{
56078 +       int domain, bus, slot, func;
56079 +       int err;
56080 +
56081 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
56082 +       if (err)
56083 +               goto out;
56084 +
56085 +       err = pcistub_device_id_remove(domain, bus, slot, func);
56086 +
56087 +      out:
56088 +       if (!err)
56089 +               err = count;
56090 +       return err;
56091 +}
56092 +
56093 +DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
56094 +
56095 +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
56096 +{
56097 +       struct pcistub_device_id *pci_dev_id;
56098 +       size_t count = 0;
56099 +       unsigned long flags;
56100 +
56101 +       spin_lock_irqsave(&device_ids_lock, flags);
56102 +       list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
56103 +               if (count >= PAGE_SIZE)
56104 +                       break;
56105 +
56106 +               count += scnprintf(buf + count, PAGE_SIZE - count,
56107 +                                  "%04x:%02x:%02x.%01x\n",
56108 +                                  pci_dev_id->domain, pci_dev_id->bus,
56109 +                                  PCI_SLOT(pci_dev_id->devfn),
56110 +                                  PCI_FUNC(pci_dev_id->devfn));
56111 +       }
56112 +       spin_unlock_irqrestore(&device_ids_lock, flags);
56113 +
56114 +       return count;
56115 +}
56116 +
56117 +DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
56118 +
56119 +static int __init pcistub_init(void)
56120 +{
56121 +       int pos = 0;
56122 +       int err = 0;
56123 +       int domain, bus, slot, func;
56124 +       int parsed;
56125 +
56126 +       if (pci_devs_to_hide && *pci_devs_to_hide) {
56127 +               do {
56128 +                       parsed = 0;
56129 +
56130 +                       err = sscanf(pci_devs_to_hide + pos,
56131 +                                    " (%x:%x:%x.%x) %n",
56132 +                                    &domain, &bus, &slot, &func, &parsed);
56133 +                       if (err != 4) {
56134 +                               domain = 0;
56135 +                               err = sscanf(pci_devs_to_hide + pos,
56136 +                                            " (%x:%x.%x) %n",
56137 +                                            &bus, &slot, &func, &parsed);
56138 +                               if (err != 3)
56139 +                                       goto parse_error;
56140 +                       }
56141 +
56142 +                       err = pcistub_device_id_add(domain, bus, slot, func);
56143 +                       if (err)
56144 +                               goto out;
56145 +
56146 +                       /* if parsed <= 0, we've reached the end of the string */
56147 +                       pos += parsed;
56148 +               } while (parsed > 0 && pci_devs_to_hide[pos]);
56149 +       }
56150 +
56151 +       /* If we're the first PCI device driver to register, we're the
56152 +        * first one to get offered PCI devices as they become
56153 +        * available (and thus we can be the first to grab them)
56154 +        */
56155 +       err = pci_register_driver(&pciback_pci_driver);
56156 +       if (err < 0)
56157 +               goto out;
56158 +
56159 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
56160 +       driver_create_file(&pciback_pci_driver.driver,
56161 +                          &driver_attr_remove_slot);
56162 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_slots);
56163 +
56164 +      out:
56165 +       return err;
56166 +
56167 +      parse_error:
56168 +       printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
56169 +              pci_devs_to_hide + pos);
56170 +       return -EINVAL;
56171 +}
56172 +
56173 +#ifndef MODULE
56174 +/*
56175 + * fs_initcall happens before device_initcall
56176 + * so pciback *should* get called first (because we
56177 + * want to grab any device before other drivers
56178 + * get a chance, by being the first PCI device
56179 + * driver to register)
56180 + */
56181 +fs_initcall(pcistub_init);
56182 +#endif
56183 +
56184 +static int __init pciback_init(void)
56185 +{
56186 +#ifdef MODULE
56187 +       int err;
56188 +
56189 +       err = pcistub_init();
56190 +       if (err < 0)
56191 +               return err;
56192 +#endif
56193 +
56194 +       pcistub_init_devices_late();
56195 +       pciback_xenbus_register();
56196 +
56197 +       return 0;
56198 +}
56199 +
56200 +static void __exit pciback_cleanup(void)
56201 +{
56202 +       pciback_xenbus_unregister();
56203 +
56204 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
56205 +       driver_remove_file(&pciback_pci_driver.driver,
56206 +                          &driver_attr_remove_slot);
56207 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
56208 +
56209 +       pci_unregister_driver(&pciback_pci_driver);
56210 +}
56211 +
56212 +module_init(pciback_init);
56213 +module_exit(pciback_cleanup);
56214 +
56215 +MODULE_LICENSE("Dual BSD/GPL");
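
The stub is driven in two ways, both visible above. At boot, pcistub_init() parses the hide parameter using the parenthesized formats in its sscanf() calls, so with pciback built in, the kernel command line takes, e.g. (device addresses here are an assumed example):

	pciback.hide=(0000:00:1d.0)(01:00.0)

At run time, the DRIVER_ATTR() entries appear under the driver's sysfs directory (paths assume the standard sysfs driver layout), so slots can be queued and inspected before binding:

	echo 0000:01:00.0 > /sys/bus/pci/drivers/pciback/new_slot
	cat /sys/bus/pci/drivers/pciback/slots
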
56216 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/vpci.c tmp-linux-2.6-xen.patch/drivers/xen/pciback/vpci.c
56217 --- ref-linux-2.6.16.9/drivers/xen/pciback/vpci.c       1970-01-01 01:00:00.000000000 +0100
56218 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/vpci.c  2006-04-10 00:05:52.000000000 +0200
56219 @@ -0,0 +1,204 @@
56220 +/*
56221 + * PCI Backend - Provides a Virtual PCI bus (with real devices)
56222 + *               to the frontend
56223 + *
56224 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56225 + */
56226 +
56227 +#include <linux/list.h>
56228 +#include <linux/slab.h>
56229 +#include <linux/pci.h>
56230 +#include <linux/spinlock.h>
56231 +#include "pciback.h"
56232 +
56233 +#define PCI_SLOT_MAX 32
56234 +
56235 +struct vpci_dev_data {
56236 +       /* Access to dev_list must be protected by lock */
56237 +       struct list_head dev_list[PCI_SLOT_MAX];
56238 +       spinlock_t lock;
56239 +};
56240 +
56241 +static inline struct list_head *list_first(struct list_head *head)
56242 +{
56243 +       return head->next;
56244 +}
56245 +
56246 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
56247 +                                   unsigned int domain, unsigned int bus,
56248 +                                   unsigned int devfn)
56249 +{
56250 +       struct pci_dev_entry *entry;
56251 +       struct pci_dev *dev = NULL;
56252 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
56253 +       unsigned long flags;
56254 +
56255 +       if (domain != 0 || bus != 0)
56256 +               return NULL;
56257 +
56258 +       if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
56259 +               spin_lock_irqsave(&vpci_dev->lock, flags);
56260 +
56261 +               list_for_each_entry(entry,
56262 +                                   &vpci_dev->dev_list[PCI_SLOT(devfn)],
56263 +                                   list) {
56264 +                       if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
56265 +                               dev = entry->dev;
56266 +                               break;
56267 +                       }
56268 +               }
56269 +
56270 +               spin_unlock_irqrestore(&vpci_dev->lock, flags);
56271 +       }
56272 +       return dev;
56273 +}
56274 +
56275 +static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
56276 +{
56277 +       if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
56278 +           && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
56279 +               return 1;
56280 +
56281 +       return 0;
56282 +}
56283 +
56284 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
56285 +{
56286 +       int err = 0, slot;
56287 +       struct pci_dev_entry *t, *dev_entry;
56288 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
56289 +       unsigned long flags;
56290 +
56291 +       if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
56292 +               err = -EFAULT;
56293 +               xenbus_dev_fatal(pdev->xdev, err,
56294 +                                "Can't export bridges on the virtual PCI bus");
56295 +               goto out;
56296 +       }
56297 +
56298 +       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
56299 +       if (!dev_entry) {
56300 +               err = -ENOMEM;
56301 +               xenbus_dev_fatal(pdev->xdev, err,
56302 +                                "Error adding entry to virtual PCI bus");
56303 +               goto out;
56304 +       }
56305 +
56306 +       dev_entry->dev = dev;
56307 +
56308 +       spin_lock_irqsave(&vpci_dev->lock, flags);
56309 +
56310 +       /* Keep multi-function devices together on the virtual PCI bus */
56311 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56312 +               if (!list_empty(&vpci_dev->dev_list[slot])) {
56313 +                       t = list_entry(list_first(&vpci_dev->dev_list[slot]),
56314 +                                      struct pci_dev_entry, list);
56315 +
56316 +                       if (match_slot(dev, t->dev)) {
56317 +                               pr_info("pciback: vpci: %s: "
56318 +                                       "assign to virtual slot %d func %d\n",
56319 +                                       pci_name(dev), slot,
56320 +                                       PCI_FUNC(dev->devfn));
56321 +                               list_add_tail(&dev_entry->list,
56322 +                                             &vpci_dev->dev_list[slot]);
56323 +                               goto unlock;
56324 +                       }
56325 +               }
56326 +       }
56327 +
56328 +       /* Assign to a new slot on the virtual PCI bus */
56329 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56330 +               if (list_empty(&vpci_dev->dev_list[slot])) {
56331 +                       printk(KERN_INFO
56332 +                              "pciback: vpci: %s: assign to virtual slot %d\n",
56333 +                              pci_name(dev), slot);
56334 +                       list_add_tail(&dev_entry->list,
56335 +                                     &vpci_dev->dev_list[slot]);
56336 +                       goto unlock;
56337 +               }
56338 +       }
56339 +
56340 +       err = -ENOMEM;
56341 +       xenbus_dev_fatal(pdev->xdev, err,
56342 +                        "No more space on root virtual PCI bus");
56343 +
56344 +      unlock:
56345 +       spin_unlock_irqrestore(&vpci_dev->lock, flags);
56346 +      out:
56347 +       return err;
56348 +}
56349 +
56350 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
56351 +{
56352 +       int slot;
56353 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
56354 +       struct pci_dev *found_dev = NULL;
56355 +       unsigned long flags;
56356 +
56357 +       spin_lock_irqsave(&vpci_dev->lock, flags);
56358 +
56359 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56360 +               struct pci_dev_entry *e, *tmp;
56361 +               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
56362 +                                        list) {
56363 +                       if (e->dev == dev) {
56364 +                               list_del(&e->list);
56365 +                               found_dev = e->dev;
56366 +                               kfree(e);
56367 +                               goto out;
56368 +                       }
56369 +               }
56370 +       }
56371 +
56372 +      out:
56373 +       spin_unlock_irqrestore(&vpci_dev->lock, flags);
56374 +
56375 +       if (found_dev)
56376 +               pcistub_put_pci_dev(found_dev);
56377 +}
56378 +
56379 +int pciback_init_devices(struct pciback_device *pdev)
56380 +{
56381 +       int slot;
56382 +       struct vpci_dev_data *vpci_dev;
56383 +
56384 +       vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
56385 +       if (!vpci_dev)
56386 +               return -ENOMEM;
56387 +
56388 +       spin_lock_init(&vpci_dev->lock);
56389 +
56390 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56391 +               INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
56392 +       }
56393 +
56394 +       pdev->pci_dev_data = vpci_dev;
56395 +
56396 +       return 0;
56397 +}
56398 +
56399 +int pciback_publish_pci_roots(struct pciback_device *pdev,
56400 +                             publish_pci_root_cb publish_cb)
56401 +{
56402 +       /* The Virtual PCI bus has only one root */
56403 +       return publish_cb(pdev, 0, 0);
56404 +}
56405 +
56406 +void pciback_release_devices(struct pciback_device *pdev)
56407 +{
56408 +       int slot;
56409 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
56410 +
56411 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56412 +               struct pci_dev_entry *e, *tmp;
56413 +               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
56414 +                                        list) {
56415 +                       list_del(&e->list);
56416 +                       pcistub_put_pci_dev(e->dev);
56417 +                       kfree(e);
56418 +               }
56419 +       }
56420 +
56421 +       kfree(vpci_dev);
56422 +       pdev->pci_dev_data = NULL;
56423 +}
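
The two passes in pciback_add_pci_dev() above keep the functions of one physical device grouped: the virtual devfn preserves the real function number, and only the slot is remapped. A worked example of the resulting layout (real addresses are assumed for illustration):

	/*
	 * real 0000:03:00.0  ->  virtual 00:00.0  (first free virtual slot)
	 * real 0000:03:00.1  ->  virtual 00:00.1  (match_slot() groups it)
	 * real 0000:05:02.0  ->  virtual 00:01.0  (next free virtual slot)
	 *
	 * i.e. for a device placed in virtual slot `slot':
	 *         virtual devfn = PCI_DEVFN(slot, PCI_FUNC(real_devfn));
	 */
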
56424 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pciback/xenbus.c tmp-linux-2.6-xen.patch/drivers/xen/pciback/xenbus.c
56425 --- ref-linux-2.6.16.9/drivers/xen/pciback/xenbus.c     1970-01-01 01:00:00.000000000 +0100
56426 +++ tmp-linux-2.6-xen.patch/drivers/xen/pciback/xenbus.c        2006-04-10 00:05:52.000000000 +0200
56427 @@ -0,0 +1,441 @@
56428 +/*
56429 + * PCI Backend Xenbus Setup - handles setup with frontend and xend
56430 + *
56431 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56432 + */
56433 +#include <linux/module.h>
56434 +#include <linux/init.h>
56435 +#include <linux/list.h>
56436 +#include <xen/xenbus.h>
56437 +#include <xen/evtchn.h>
56438 +#include "pciback.h"
56439 +
56440 +#define INVALID_EVTCHN_IRQ  (-1)
56441 +
56442 +static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
56443 +{
56444 +       struct pciback_device *pdev;
56445 +
56446 +       pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
56447 +       if (pdev == NULL)
56448 +               goto out;
56449 +       dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
56450 +
56451 +       pdev->xdev = xdev;
56452 +       xdev->data = pdev;
56453 +
56454 +       spin_lock_init(&pdev->dev_lock);
56455 +
56456 +       pdev->sh_info = NULL;
56457 +       pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
56458 +       pdev->be_watching = 0;
56459 +
56460 +       if (pciback_init_devices(pdev)) {
56461 +               kfree(pdev);
56462 +               pdev = NULL;
56463 +       }
56464 +      out:
56465 +       return pdev;
56466 +}
56467 +
56468 +static void free_pdev(struct pciback_device *pdev)
56469 +{
56470 +       if (pdev->be_watching)
56471 +               unregister_xenbus_watch(&pdev->be_watch);
56472 +
56473 +       /* Ensure the guest can't trigger our handler before removing devices */
56474 +       if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ)
56475 +               unbind_from_irqhandler(pdev->evtchn_irq, pdev);
56476 +
56477 +       if (pdev->sh_info)
56478 +               xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
56479 +
56480 +       pciback_release_devices(pdev);
56481 +
56482 +       pdev->xdev->data = NULL;
56483 +       pdev->xdev = NULL;
56484 +
56485 +       kfree(pdev);
56486 +}
56487 +
56488 +static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
56489 +                            int remote_evtchn)
56490 +{
56491 +       int err = 0;
56492 +       int evtchn;
56493 +       dev_dbg(&pdev->xdev->dev,
56494 +               "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
56495 +               gnt_ref, remote_evtchn);
56496 +
56497 +       err =
56498 +           xenbus_map_ring_valloc(pdev->xdev, gnt_ref,
56499 +                                  (void **)&pdev->sh_info);
56500 +       if (err)
56501 +               goto out;
56502 +
56503 +       err = xenbus_bind_evtchn(pdev->xdev, remote_evtchn, &evtchn);
56504 +       if (err)
56505 +               goto out;
56506 +
56507 +       err = bind_evtchn_to_irqhandler(evtchn, pciback_handle_event,
56508 +                                       SA_SAMPLE_RANDOM, "pciback", pdev);
56509 +       if (err < 0) {
56510 +               xenbus_dev_fatal(pdev->xdev, err,
56511 +                                "Error binding event channel to IRQ");
56512 +               goto out;
56513 +       }
56514 +       pdev->evtchn_irq = err;
56515 +       err = 0;
56516 +
56517 +       dev_dbg(&pdev->xdev->dev, "Attached!\n");
56518 +      out:
56519 +       return err;
56520 +}
56521 +
56522 +static int pciback_attach(struct pciback_device *pdev)
56523 +{
56524 +       int err = 0;
56525 +       int gnt_ref, remote_evtchn;
56526 +       char *magic = NULL;
56527 +
56528 +       spin_lock(&pdev->dev_lock);
56529 +
56530 +       /* Make sure we only do this setup once */
56531 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
56532 +           XenbusStateInitialised)
56533 +               goto out;
56534 +
56535 +       /* Wait for frontend to state that it has published the configuration */
56536 +       if (xenbus_read_driver_state(pdev->xdev->otherend) !=
56537 +           XenbusStateInitialised)
56538 +               goto out;
56539 +
56540 +       dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
56541 +
56542 +       err = xenbus_gather(XBT_NULL, pdev->xdev->otherend,
56543 +                           "pci-op-ref", "%u", &gnt_ref,
56544 +                           "event-channel", "%u", &remote_evtchn,
56545 +                           "magic", NULL, &magic, NULL);
56546 +       if (err) {
56547 +               /* If configuration didn't get read correctly, wait longer */
56548 +               xenbus_dev_fatal(pdev->xdev, err,
56549 +                                "Error reading configuration from frontend");
56550 +               goto out;
56551 +       }
56552 +
56553 +       if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
56554 +               xenbus_dev_fatal(pdev->xdev, -EFAULT,
56555 +                                "version mismatch (%s/%s) with pcifront - "
56556 +                                "halting pciback",
56557 +                                magic, XEN_PCI_MAGIC);
56558 +               goto out;
56559 +       }
56560 +
56561 +       err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
56562 +       if (err)
56563 +               goto out;
56564 +
56565 +       dev_dbg(&pdev->xdev->dev, "Connecting...\n");
56566 +
56567 +       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
56568 +       if (err)
56569 +               xenbus_dev_fatal(pdev->xdev, err,
56570 +                                "Error switching to connected state!");
56571 +
56572 +       dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
56573 +      out:
56574 +       spin_unlock(&pdev->dev_lock);
56575 +
56576 +       kfree(magic);           /* kfree(NULL) is a no-op */
56578 +
56579 +       return err;
56580 +}
56581 +
56582 +static void pciback_frontend_changed(struct xenbus_device *xdev,
56583 +                                    XenbusState fe_state)
56584 +{
56585 +       struct pciback_device *pdev = xdev->data;
56586 +
56587 +       dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
56588 +
56589 +       switch (fe_state) {
56590 +       case XenbusStateInitialised:
56591 +               pciback_attach(pdev);
56592 +               break;
56593 +
56594 +       case XenbusStateClosing:
56595 +               xenbus_switch_state(xdev, XenbusStateClosing);
56596 +               break;
56597 +
56598 +       case XenbusStateClosed:
56599 +               dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
56600 +               device_unregister(&xdev->dev);
56601 +               break;
56602 +
56603 +       default:
56604 +               break;
56605 +       }
56606 +}
56607 +
56608 +static int pciback_publish_pci_root(struct pciback_device *pdev,
56609 +                                   unsigned int domain, unsigned int bus)
56610 +{
56611 +       unsigned int d, b;
56612 +       int i, root_num, len, err;
56613 +       char str[64];
56614 +
56615 +       dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
56616 +
56617 +       err = xenbus_scanf(XBT_NULL, pdev->xdev->nodename,
56618 +                          "root_num", "%d", &root_num);
56619 +       if (err == 0 || err == -ENOENT)
56620 +               root_num = 0;
56621 +       else if (err < 0)
56622 +               goto out;
56623 +
56624 +       /* Verify that we haven't already published this pci root */
56625 +       for (i = 0; i < root_num; i++) {
56626 +               len = snprintf(str, sizeof(str), "root-%d", i);
56627 +               if (unlikely(len >= (sizeof(str) - 1))) {
56628 +                       err = -ENOMEM;
56629 +                       goto out;
56630 +               }
56631 +
56632 +               err = xenbus_scanf(XBT_NULL, pdev->xdev->nodename,
56633 +                                  str, "%x:%x", &d, &b);
56634 +               if (err < 0)
56635 +                       goto out;
56636 +               if (err != 2) {
56637 +                       err = -EINVAL;
56638 +                       goto out;
56639 +               }
56640 +
56641 +               if (d == domain && b == bus) {
56642 +                       err = 0;
56643 +                       goto out;
56644 +               }
56645 +       }
56646 +
56647 +       len = snprintf(str, sizeof(str), "root-%d", root_num);
56648 +       if (unlikely(len >= (sizeof(str) - 1))) {
56649 +               err = -ENOMEM;
56650 +               goto out;
56651 +       }
56652 +
56653 +       dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
56654 +               root_num, domain, bus);
56655 +
56656 +       err = xenbus_printf(XBT_NULL, pdev->xdev->nodename, str,
56657 +                           "%04x:%02x", domain, bus);
56658 +       if (err)
56659 +               goto out;
56660 +
56661 +       err = xenbus_printf(XBT_NULL, pdev->xdev->nodename,
56662 +                           "root_num", "%d", (root_num + 1));
56663 +
56664 +      out:
56665 +       return err;
56666 +}
56667 +
56668 +static int pciback_export_device(struct pciback_device *pdev,
56669 +                                int domain, int bus, int slot, int func)
56670 +{
56671 +       struct pci_dev *dev;
56672 +       int err = 0;
56673 +
56674 +       dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
56675 +               domain, bus, slot, func);
56676 +
56677 +       dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
56678 +       if (!dev) {
56679 +               err = -EINVAL;
56680 +               xenbus_dev_fatal(pdev->xdev, err,
56681 +                                "Couldn't locate PCI device "
56682 +                                "(%04x:%02x:%02x.%01x)! "
56683 +                                "perhaps already in-use?",
56684 +                                domain, bus, slot, func);
56685 +               goto out;
56686 +       }
56687 +
56688 +       err = pciback_add_pci_dev(pdev, dev);
56689 +       if (err)
56690 +               goto out;
56691 +
56692 +       /* TODO: It'd be nice to export a bridge and have all of its children
56693 +        * get exported with it. This may be best done in xend (which will
56694 +        * have to calculate resource usage anyway) but we probably want to
56695 +        * put something in here to ensure that if a bridge is given to a
56696 +        * driver domain, all devices under that bridge are not given to
56697 +        * other driver domains (since whoever controls the bridge can
56698 +        * disable it and stop the other devices from working).
56699 +        */
56700 +      out:
56701 +       return err;
56702 +}
56703 +
56704 +static int pciback_setup_backend(struct pciback_device *pdev)
56705 +{
56706 +       /* Get configuration from xend (if available now) */
56707 +       int domain, bus, slot, func;
56708 +       int err = 0;
56709 +       int i, num_devs;
56710 +       char dev_str[64];
56711 +
56712 +       spin_lock(&pdev->dev_lock);
56713 +
56714 +       /* It's possible we could get the call to setup twice, so make sure
56715 +        * we're not already connected.
56716 +        */
56717 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
56718 +           XenbusStateInitWait)
56719 +               goto out;
56720 +
56721 +       dev_dbg(&pdev->xdev->dev, "getting be setup\n");
56722 +
56723 +       err = xenbus_scanf(XBT_NULL, pdev->xdev->nodename, "num_devs", "%d",
56724 +                          &num_devs);
56725 +       if (err != 1) {
56726 +               if (err >= 0)
56727 +                       err = -EINVAL;
56728 +               xenbus_dev_fatal(pdev->xdev, err,
56729 +                                "Error reading number of devices");
56730 +               goto out;
56731 +       }
56732 +
56733 +       for (i = 0; i < num_devs; i++) {
56734 +               int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
56735 +               if (unlikely(l >= (sizeof(dev_str) - 1))) {
56736 +                       err = -ENOMEM;
56737 +                       xenbus_dev_fatal(pdev->xdev, err,
56738 +                                        "String overflow while reading "
56739 +                                        "configuration");
56740 +                       goto out;
56741 +               }
56742 +
56743 +               err = xenbus_scanf(XBT_NULL, pdev->xdev->nodename, dev_str,
56744 +                                  "%x:%x:%x.%x", &domain, &bus, &slot, &func);
56745 +               if (err < 0) {
56746 +                       xenbus_dev_fatal(pdev->xdev, err,
56747 +                                        "Error reading device configuration");
56748 +                       goto out;
56749 +               }
56750 +               if (err != 4) {
56751 +                       err = -EINVAL;
56752 +                       xenbus_dev_fatal(pdev->xdev, err,
56753 +                                        "Error parsing pci device "
56754 +                                        "configuration");
56755 +                       goto out;
56756 +               }
56757 +
56758 +               err = pciback_export_device(pdev, domain, bus, slot, func);
56759 +               if (err)
56760 +                       goto out;
56761 +       }
56762 +
56763 +       err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
56764 +       if (err) {
56765 +               xenbus_dev_fatal(pdev->xdev, err,
56766 +                                "Error while publishing PCI root buses "
56767 +                                "for frontend");
56768 +               goto out;
56769 +       }
56770 +
56771 +       err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
56772 +       if (err)
56773 +               xenbus_dev_fatal(pdev->xdev, err,
56774 +                                "Error switching to initialised state!");
56775 +
56776 +      out:
56777 +       spin_unlock(&pdev->dev_lock);
56778 +
56779 +       if (!err)
56780 +               /* see if pcifront is already configured (if not, we'll wait) */
56781 +               pciback_attach(pdev);
56782 +
56783 +       return err;
56784 +}
56785 +
56786 +static void pciback_be_watch(struct xenbus_watch *watch,
56787 +                            const char **vec, unsigned int len)
56788 +{
56789 +       struct pciback_device *pdev =
56790 +           container_of(watch, struct pciback_device, be_watch);
56791 +
56792 +       switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
56793 +       case XenbusStateInitWait:
56794 +               pciback_setup_backend(pdev);
56795 +               break;
56796 +
56797 +       default:
56798 +               break;
56799 +       }
56800 +}
56801 +
56802 +static int pciback_xenbus_probe(struct xenbus_device *dev,
56803 +                               const struct xenbus_device_id *id)
56804 +{
56805 +       int err = 0;
56806 +       struct pciback_device *pdev = alloc_pdev(dev);
56807 +
56808 +       if (pdev == NULL) {
56809 +               err = -ENOMEM;
56810 +               xenbus_dev_fatal(dev, err,
56811 +                                "Error allocating pciback_device struct");
56812 +               goto out;
56813 +       }
56814 +
56815 +       /* wait for xend to configure us */
56816 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
56817 +       if (err)
56818 +               goto out;
56819 +
56820 +       /* watch the backend node for backend configuration information */
56821 +       err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
56822 +                               pciback_be_watch);
56823 +       if (err)
56824 +               goto out;
56825 +       pdev->be_watching = 1;
56826 +
56827 +       /* We need to force a call to our callback here in case
56828 +        * xend already configured us!
56829 +        */
56830 +       pciback_be_watch(&pdev->be_watch, NULL, 0);
56831 +
56832 +      out:
56833 +       return err;
56834 +}
56835 +
56836 +static int pciback_xenbus_remove(struct xenbus_device *dev)
56837 +{
56838 +       struct pciback_device *pdev = dev->data;
56839 +
56840 +       if (pdev != NULL)
56841 +               free_pdev(pdev);
56842 +
56843 +       return 0;
56844 +}
56845 +
56846 +static struct xenbus_device_id xenpci_ids[] = {
56847 +       {"pci"},
56848 +       {{0}},
56849 +};
56850 +
56851 +static struct xenbus_driver xenbus_pciback_driver = {
56852 +       .name                   = "pciback",
56853 +       .owner                  = THIS_MODULE,
56854 +       .ids                    = xenpci_ids,
56855 +       .probe                  = pciback_xenbus_probe,
56856 +       .remove                 = pciback_xenbus_remove,
56857 +       .otherend_changed       = pciback_frontend_changed,
56858 +};
56859 +
56860 +int __init pciback_xenbus_register(void)
56861 +{
56862 +       return xenbus_register_backend(&xenbus_pciback_driver);
56863 +}
56864 +
56865 +void __exit pciback_xenbus_unregister(void)
56866 +{
56867 +       xenbus_unregister_driver(&xenbus_pciback_driver);
56868 +}
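
(For orientation, a hedged sketch of the xenstore layout the backend code above works with. The key names come straight from the xenbus_scanf/xenbus_printf calls; the paths and example values are illustrative and follow the usual xenbus backend/frontend convention rather than anything stated in this patch.)

    backend/pci/<frontend-domid>/<devid>/        (pdev->xdev->nodename)
        num_devs      = "1"             written by the toolstack, read by pciback_setup_backend()
        dev-0         = "0:0:1d.0"      "%x:%x:%x.%x" = domain:bus:slot.func
        root_num      = "1"             written by pciback_publish_pci_root()
        root-0        = "0000:00"       "%04x:%02x" = domain:bus

    /local/domain/<frontend-domid>/device/pci/<devid>/   (pdev->xdev->otherend)
        pci-op-ref    = "<grant ref>"   read by pciback_attach() via xenbus_gather()
        event-channel = "<port>"
        magic         = XEN_PCI_MAGIC
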
56869 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pcifront/Makefile tmp-linux-2.6-xen.patch/drivers/xen/pcifront/Makefile
56870 --- ref-linux-2.6.16.9/drivers/xen/pcifront/Makefile    1970-01-01 01:00:00.000000000 +0100
56871 +++ tmp-linux-2.6-xen.patch/drivers/xen/pcifront/Makefile       2006-04-10 00:05:52.000000000 +0200
56872 @@ -0,0 +1,7 @@
56873 +obj-y += pcifront.o
56874 +
56875 +pcifront-y := pci_op.o xenbus.o pci.o
56876 +
56877 +ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y)
56878 +EXTRA_CFLAGS += -DDEBUG
56879 +endif
56880 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pcifront/pci.c tmp-linux-2.6-xen.patch/drivers/xen/pcifront/pci.c
56881 --- ref-linux-2.6.16.9/drivers/xen/pcifront/pci.c       1970-01-01 01:00:00.000000000 +0100
56882 +++ tmp-linux-2.6-xen.patch/drivers/xen/pcifront/pci.c  2006-04-10 00:05:52.000000000 +0200
56883 @@ -0,0 +1,46 @@
56884 +/*
56885 + * PCI Frontend Operations - ensure only one PCI frontend runs at a time
56886 + *
56887 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56888 + */
56889 +#include <linux/module.h>
56890 +#include <linux/init.h>
56891 +#include <linux/pci.h>
56892 +#include <linux/spinlock.h>
56893 +#include "pcifront.h"
56894 +
56895 +DEFINE_SPINLOCK(pcifront_dev_lock);
56896 +static struct pcifront_device *pcifront_dev = NULL;
56897 +
56898 +int pcifront_connect(struct pcifront_device *pdev)
56899 +{
56900 +       int err = 0;
56901 +
56902 +       spin_lock(&pcifront_dev_lock);
56903 +
56904 +       if (!pcifront_dev) {
56905 +               dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
56906 +               pcifront_dev = pdev;
56907 +       }
56908 +       else {
56909 +               dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
56910 +               err = -EEXIST;
56911 +       }
56912 +
56913 +       spin_unlock(&pcifront_dev_lock);
56914 +
56915 +       return err;
56916 +}
56917 +
56918 +void pcifront_disconnect(struct pcifront_device *pdev)
56919 +{
56920 +       spin_lock(&pcifront_dev_lock);
56921 +
56922 +       if (pdev == pcifront_dev) {
56923 +               dev_info(&pdev->xdev->dev,
56924 +                        "Disconnecting PCI Frontend Buses\n");
56925 +               pcifront_dev = NULL;
56926 +       }
56927 +
56928 +       spin_unlock(&pcifront_dev_lock);
56929 +}
56930 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pcifront/pcifront.h tmp-linux-2.6-xen.patch/drivers/xen/pcifront/pcifront.h
56931 --- ref-linux-2.6.16.9/drivers/xen/pcifront/pcifront.h  1970-01-01 01:00:00.000000000 +0100
56932 +++ tmp-linux-2.6-xen.patch/drivers/xen/pcifront/pcifront.h     2006-04-10 00:05:52.000000000 +0200
56933 @@ -0,0 +1,40 @@
56934 +/*
56935 + * PCI Frontend - Common data structures & function declarations
56936 + *
56937 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56938 + */
56939 +#ifndef __XEN_PCIFRONT_H__
56940 +#define __XEN_PCIFRONT_H__
56941 +
56942 +#include <linux/spinlock.h>
56943 +#include <linux/pci.h>
56944 +#include <xen/xenbus.h>
56945 +#include <xen/interface/io/pciif.h>
56946 +#include <xen/pcifront.h>
56947 +
56948 +struct pci_bus_entry {
56949 +       struct list_head list;
56950 +       struct pci_bus *bus;
56951 +};
56952 +
56953 +struct pcifront_device {
56954 +       struct xenbus_device *xdev;
56955 +       struct list_head root_buses;
56956 +       spinlock_t dev_lock;
56957 +
56958 +       int evtchn;
56959 +       int gnt_ref;
56960 +
56961 +       /* Lock this when doing any operations in sh_info */
56962 +       spinlock_t sh_info_lock;
56963 +       struct xen_pci_sharedinfo *sh_info;
56964 +};
56965 +
56966 +int pcifront_connect(struct pcifront_device *pdev);
56967 +void pcifront_disconnect(struct pcifront_device *pdev);
56968 +
56969 +int pcifront_scan_root(struct pcifront_device *pdev,
56970 +                      unsigned int domain, unsigned int bus);
56971 +void pcifront_free_roots(struct pcifront_device *pdev);
56972 +
56973 +#endif /* __XEN_PCIFRONT_H__ */
56974 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pcifront/pci_op.c tmp-linux-2.6-xen.patch/drivers/xen/pcifront/pci_op.c
56975 --- ref-linux-2.6.16.9/drivers/xen/pcifront/pci_op.c    1970-01-01 01:00:00.000000000 +0100
56976 +++ tmp-linux-2.6-xen.patch/drivers/xen/pcifront/pci_op.c       2006-04-10 00:05:52.000000000 +0200
56977 @@ -0,0 +1,272 @@
56978 +/*
56979 + * PCI Frontend Operations - Communicates with the backend (pciback)
56980 + *
56981 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56982 + */
56983 +#include <linux/module.h>
56984 +#include <linux/version.h>
56985 +#include <linux/init.h>
56986 +#include <linux/pci.h>
56987 +#include <linux/spinlock.h>
56988 +#include <linux/time.h>
56989 +#include <xen/evtchn.h>
56990 +#include "pcifront.h"
56991 +
56992 +static int verbose_request = 0;
56993 +module_param(verbose_request, int, 0644);
56994 +
56995 +static int errno_to_pcibios_err(int errno)
56996 +{
56997 +       switch (errno) {
56998 +       case XEN_PCI_ERR_success:
56999 +               return PCIBIOS_SUCCESSFUL;
57000 +
57001 +       case XEN_PCI_ERR_dev_not_found:
57002 +               return PCIBIOS_DEVICE_NOT_FOUND;
57003 +
57004 +       case XEN_PCI_ERR_invalid_offset:
57005 +       case XEN_PCI_ERR_op_failed:
57006 +               return PCIBIOS_BAD_REGISTER_NUMBER;
57007 +
57008 +       case XEN_PCI_ERR_not_implemented:
57009 +               return PCIBIOS_FUNC_NOT_SUPPORTED;
57010 +
57011 +       case XEN_PCI_ERR_access_denied:
57012 +               return PCIBIOS_SET_FAILED;
57013 +       }
57014 +       return errno;
57015 +}
57016 +
57017 +static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
57018 +{
57019 +       int err = 0;
57020 +       struct xen_pci_op *active_op = &pdev->sh_info->op;
57021 +       unsigned long irq_flags;
57022 +       evtchn_port_t port = pdev->evtchn;
57023 +       nsec_t ns, ns_timeout;
57024 +       struct timeval tv;
57025 +
57026 +       spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
57027 +
57028 +       memcpy(active_op, op, sizeof(struct xen_pci_op));
57029 +
57030 +       /* Go */
57031 +       wmb();
57032 +       set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
57033 +       notify_remote_via_evtchn(port);
57034 +
57035 +       /*
57036 +        * We set a poll timeout of 3 seconds but give up on return after
57037 +        * 2 seconds. It is better to time out too late rather than too early
57038 +        * (in the latter case we end up continually re-executing poll() with a
57039 +        * timeout in the past). 1s difference gives plenty of slack for error.
57040 +        */
57041 +       do_gettimeofday(&tv);
57042 +       ns_timeout = timeval_to_ns(&tv) + 2 * (nsec_t)NSEC_PER_SEC;
57043 +
57044 +       clear_evtchn(port);
57045 +
57046 +       while (test_bit(_XEN_PCIF_active,
57047 +                       (unsigned long *)&pdev->sh_info->flags)) {
57048 +               if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
57049 +                       BUG();
57050 +               clear_evtchn(port);
57051 +               do_gettimeofday(&tv);
57052 +               ns = timeval_to_ns(&tv);
57053 +               if (ns > ns_timeout) {
57054 +                       dev_err(&pdev->xdev->dev,
57055 +                               "pciback not responding!!!\n");
57056 +                       clear_bit(_XEN_PCIF_active,
57057 +                                 (unsigned long *)&pdev->sh_info->flags);
57058 +                       err = XEN_PCI_ERR_dev_not_found;
57059 +                       goto out;
57060 +               }
57061 +       }
57062 +
57063 +       memcpy(op, active_op, sizeof(struct xen_pci_op));
57064 +
57065 +       err = op->err;
57066 +      out:
57067 +       spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
57068 +       return err;
57069 +}
57070 +
57071 +/* Access to this function is spinlocked in drivers/pci/access.c */
57072 +static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
57073 +                            int where, int size, u32 * val)
57074 +{
57075 +       int err = 0;
57076 +       struct xen_pci_op op = {
57077 +               .cmd    = XEN_PCI_OP_conf_read,
57078 +               .domain = pci_domain_nr(bus),
57079 +               .bus    = bus->number,
57080 +               .devfn  = devfn,
57081 +               .offset = where,
57082 +               .size   = size,
57083 +       };
57084 +       struct pcifront_sd *sd = bus->sysdata;
57085 +       struct pcifront_device *pdev = sd->pdev;
57086 +
57087 +       if (verbose_request)
57088 +               dev_info(&pdev->xdev->dev,
57089 +                        "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
57090 +                        pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
57091 +                        PCI_FUNC(devfn), where, size);
57092 +
57093 +       err = do_pci_op(pdev, &op);
57094 +
57095 +       if (likely(!err)) {
57096 +               if (verbose_request)
57097 +                       dev_info(&pdev->xdev->dev, "read got back value %x\n",
57098 +                                op.value);
57099 +
57100 +               *val = op.value;
57101 +       } else if (err == -ENODEV) {
57102 +               /* No device here, pretend that it just returned 0 */
57103 +               err = 0;
57104 +               *val = 0;
57105 +       }
57106 +
57107 +       return errno_to_pcibios_err(err);
57108 +}
57109 +
57110 +/* Access to this function is spinlocked in drivers/pci/access.c */
57111 +static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
57112 +                             int where, int size, u32 val)
57113 +{
57114 +       struct xen_pci_op op = {
57115 +               .cmd    = XEN_PCI_OP_conf_write,
57116 +               .domain = pci_domain_nr(bus),
57117 +               .bus    = bus->number,
57118 +               .devfn  = devfn,
57119 +               .offset = where,
57120 +               .size   = size,
57121 +               .value  = val,
57122 +       };
57123 +       struct pcifront_sd *sd = bus->sysdata;
57124 +       struct pcifront_device *pdev = sd->pdev;
57125 +
57126 +       if (verbose_request)
57127 +               dev_info(&pdev->xdev->dev,
57128 +                        "write dev=%04x:%02x:%02x.%01x - "
57129 +                        "offset %x size %d val %x\n",
57130 +                        pci_domain_nr(bus), bus->number,
57131 +                        PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
57132 +
57133 +       return errno_to_pcibios_err(do_pci_op(pdev, &op));
57134 +}
57135 +
57136 +struct pci_ops pcifront_bus_ops = {
57137 +       .read = pcifront_bus_read,
57138 +       .write = pcifront_bus_write,
57139 +};
57140 +
57141 +/* Claim resources for the PCI frontend as-is, backend won't allow changes */
57142 +static void pcifront_claim_resource(struct pci_dev *dev, void *data)
57143 +{
57144 +       struct pcifront_device *pdev = data;
57145 +       int i;
57146 +       struct resource *r;
57147 +
57148 +       for (i = 0; i < PCI_NUM_RESOURCES; i++) {
57149 +               r = &dev->resource[i];
57150 +
57151 +               if (!r->parent && r->start && r->flags) {
57152 +                       dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
57153 +                               pci_name(dev), i);
57154 +                       pci_claim_resource(dev, i);
57155 +               }
57156 +       }
57157 +}
57158 +
57159 +int pcifront_scan_root(struct pcifront_device *pdev,
57160 +                      unsigned int domain, unsigned int bus)
57161 +{
57162 +       struct pci_bus *b;
57163 +       struct pcifront_sd *sd = NULL;
57164 +       struct pci_bus_entry *bus_entry = NULL;
57165 +       int err = 0;
57166 +
57167 +#ifndef CONFIG_PCI_DOMAINS
57168 +       if (domain != 0) {
57169 +               dev_err(&pdev->xdev->dev,
57170 +                       "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
57171 +               dev_err(&pdev->xdev->dev,
57172 +                       "Please compile with CONFIG_PCI_DOMAINS\n");
57173 +               err = -EINVAL;
57174 +               goto err_out;
57175 +       }
57176 +#endif
57177 +
57178 +       dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
57179 +                domain, bus);
57180 +
57181 +       bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
57182 +       sd = kmalloc(sizeof(*sd), GFP_KERNEL);
57183 +       if (!bus_entry || !sd) {
57184 +               err = -ENOMEM;
57185 +               goto err_out;
57186 +       }
57187 +       sd->domain = domain;
57188 +       sd->pdev = pdev;
57189 +
57190 +       b = pci_scan_bus_parented(&pdev->xdev->dev, bus, &pcifront_bus_ops, sd);
57191 +       if (!b) {
57192 +               dev_err(&pdev->xdev->dev, "Error creating PCI Frontend Bus!\n");
57193 +               err = -ENOMEM;
57194 +               goto err_out;
57195 +       }
57196 +       bus_entry->bus = b;
57197 +
57198 +       list_add(&bus_entry->list, &pdev->root_buses);
57199 +
57200 +       /* Claim resources before going "live" with our devices */
57201 +       pci_walk_bus(b, pcifront_claim_resource, pdev);
57202 +
57203 +       pci_bus_add_devices(b);
57204 +
57205 +       return 0;
57206 +
57207 +      err_out:
57208 +       kfree(bus_entry);
57209 +       kfree(sd);
57210 +
57211 +       return err;
57212 +}
57213 +
57214 +static void free_root_bus_devs(struct pci_bus *bus)
57215 +{
57216 +       struct pci_dev *dev;
57217 +
57218 +       spin_lock(&pci_bus_lock);
57219 +       while (!list_empty(&bus->devices)) {
57220 +               dev = container_of(bus->devices.next, struct pci_dev, bus_list);
57221 +               spin_unlock(&pci_bus_lock);
57222 +
57223 +               dev_dbg(&dev->dev, "removing device\n");
57224 +               pci_remove_bus_device(dev);
57225 +
57226 +               spin_lock(&pci_bus_lock);
57227 +       }
57228 +       spin_unlock(&pci_bus_lock);
57229 +}
57230 +
57231 +void pcifront_free_roots(struct pcifront_device *pdev)
57232 +{
57233 +       struct pci_bus_entry *bus_entry, *t;
57234 +
57235 +       dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
57236 +
57237 +       list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
57238 +               list_del(&bus_entry->list);
57239 +
57240 +               free_root_bus_devs(bus_entry->bus);
57241 +
57242 +               kfree(bus_entry->bus->sysdata);
57243 +
57244 +               device_unregister(bus_entry->bus->bridge);
57245 +               pci_remove_bus(bus_entry->bus);
57246 +
57247 +               kfree(bus_entry);
57248 +       }
57249 +}
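
(A hedged illustration of how the ops above get exercised: once pcifront_scan_root() has created a bus with pcifront_bus_ops, an ordinary driver's config access needs nothing pcifront-specific. `dev` below stands for any pci_dev on such a bus.)

    u32 vendor;
    pci_read_config_dword(dev, PCI_VENDOR_ID, &vendor);
    /* The PCI core routes this through pcifront_bus_read(), which packages
     * it as a XEN_PCI_OP_conf_read request in the shared page via
     * do_pci_op(); pciback in the driver domain performs the real access. */
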
57250 diff -Nurp ref-linux-2.6.16.9/drivers/xen/pcifront/xenbus.c tmp-linux-2.6-xen.patch/drivers/xen/pcifront/xenbus.c
57251 --- ref-linux-2.6.16.9/drivers/xen/pcifront/xenbus.c    1970-01-01 01:00:00.000000000 +0100
57252 +++ tmp-linux-2.6-xen.patch/drivers/xen/pcifront/xenbus.c       2006-04-10 00:05:52.000000000 +0200
57253 @@ -0,0 +1,294 @@
57254 +/*
57255 + * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
57256 + *
57257 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
57258 + */
57259 +#include <linux/module.h>
57260 +#include <linux/init.h>
57261 +#include <linux/mm.h>
57262 +#include <xen/xenbus.h>
57263 +#include "pcifront.h"
57264 +
57265 +#define INVALID_GRANT_REF (0)
57266 +#define INVALID_EVTCHN    (-1)
57267 +
57268 +static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
57269 +{
57270 +       struct pcifront_device *pdev;
57271 +
57272 +       pdev = kmalloc(sizeof(struct pcifront_device), GFP_KERNEL);
57273 +       if (pdev == NULL)
57274 +               goto out;
57275 +
57276 +       pdev->sh_info =
57277 +           (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
57278 +       if (pdev->sh_info == NULL) {
57279 +               kfree(pdev);
57280 +               pdev = NULL;
57281 +               goto out;
57282 +       }
57283 +       pdev->sh_info->flags = 0;
57284 +
57285 +       xdev->data = pdev;
57286 +       pdev->xdev = xdev;
57287 +
57288 +       INIT_LIST_HEAD(&pdev->root_buses);
57289 +
57290 +       spin_lock_init(&pdev->dev_lock);
57291 +       spin_lock_init(&pdev->sh_info_lock);
57292 +
57293 +       pdev->evtchn = INVALID_EVTCHN;
57294 +       pdev->gnt_ref = INVALID_GRANT_REF;
57295 +
57296 +       dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
57297 +               pdev, pdev->sh_info);
57298 +      out:
57299 +       return pdev;
57300 +}
57301 +
57302 +static void free_pdev(struct pcifront_device *pdev)
57303 +{
57304 +       dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
57305 +
57306 +       pcifront_free_roots(pdev);
57307 +
57308 +       if (pdev->evtchn != INVALID_EVTCHN)
57309 +               xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
57310 +
57311 +       if (pdev->gnt_ref != INVALID_GRANT_REF)
57312 +               gnttab_end_foreign_access(pdev->gnt_ref, 0,
57313 +                                         (unsigned long)pdev->sh_info);
57314 +
57315 +       pdev->xdev->data = NULL;
57316 +
57317 +       kfree(pdev);
57318 +}
57319 +
57320 +static int pcifront_publish_info(struct pcifront_device *pdev)
57321 +{
57322 +       int err = 0;
57323 +       xenbus_transaction_t trans;
57324 +
57325 +       err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
57326 +       if (err < 0)
57327 +               goto out;
57328 +
57329 +       pdev->gnt_ref = err;
57330 +
57331 +       err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
57332 +       if (err)
57333 +               goto out;
57334 +
57335 +      do_publish:
57336 +       err = xenbus_transaction_start(&trans);
57337 +       if (err) {
57338 +               xenbus_dev_fatal(pdev->xdev, err,
57339 +                                "Error writing configuration for backend "
57340 +                                "(start transaction)");
57341 +               goto out;
57342 +       }
57343 +
57344 +       err = xenbus_printf(trans, pdev->xdev->nodename,
57345 +                           "pci-op-ref", "%u", pdev->gnt_ref);
57346 +       if (!err)
57347 +               err = xenbus_printf(trans, pdev->xdev->nodename,
57348 +                                   "event-channel", "%u", pdev->evtchn);
57349 +       if (!err)
57350 +               err = xenbus_printf(trans, pdev->xdev->nodename,
57351 +                                   "magic", XEN_PCI_MAGIC);
57352 +
57353 +       if (err) {
57354 +               xenbus_transaction_end(trans, 1);
57355 +               xenbus_dev_fatal(pdev->xdev, err,
57356 +                                "Error writing configuration for backend");
57357 +               goto out;
57358 +       } else {
57359 +               err = xenbus_transaction_end(trans, 0);
57360 +               if (err == -EAGAIN)
57361 +                       goto do_publish;
57362 +               else if (err) {
57363 +                       xenbus_dev_fatal(pdev->xdev, err,
57364 +                                        "Error completing transaction "
57365 +                                        "for backend");
57366 +                       goto out;
57367 +               }
57368 +       }
57369 +
57370 +       xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
57371 +
57372 +       dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
57373 +
57374 +      out:
57375 +       return err;
57376 +}
57377 +
57378 +static int pcifront_try_connect(struct pcifront_device *pdev)
57379 +{
57380 +       int err = -EFAULT;
57381 +       int i, num_roots, len;
57382 +       char str[64];
57383 +       unsigned int domain, bus;
57384 +
57385 +       spin_lock(&pdev->dev_lock);
57386 +
57387 +       /* Only connect once */
57388 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
57389 +           XenbusStateInitialised)
57390 +               goto out;
57391 +
57392 +       err = pcifront_connect(pdev);
57393 +       if (err) {
57394 +               xenbus_dev_fatal(pdev->xdev, err,
57395 +                                "Error connecting PCI Frontend");
57396 +               goto out;
57397 +       }
57398 +
57399 +       err = xenbus_scanf(XBT_NULL, pdev->xdev->otherend,
57400 +                          "root_num", "%d", &num_roots);
57401 +       if (err == -ENOENT) {
57402 +               xenbus_dev_error(pdev->xdev, err,
57403 +                                "No PCI Roots found, trying 0000:00");
57404 +               err = pcifront_scan_root(pdev, 0, 0);
57405 +               num_roots = 0;
57406 +       } else if (err != 1) {
57407 +               if (err == 0)
57408 +                       err = -EINVAL;
57409 +               xenbus_dev_fatal(pdev->xdev, err,
57410 +                                "Error reading number of PCI roots");
57411 +               goto out;
57412 +       }
57413 +
57414 +       for (i = 0; i < num_roots; i++) {
57415 +               len = snprintf(str, sizeof(str), "root-%d", i);
57416 +               if (unlikely(len >= (sizeof(str) - 1))) {
57417 +                       err = -ENOMEM;
57418 +                       goto out;
57419 +               }
57420 +
57421 +               err = xenbus_scanf(XBT_NULL, pdev->xdev->otherend, str,
57422 +                                  "%x:%x", &domain, &bus);
57423 +               if (err != 2) {
57424 +                       if (err >= 0)
57425 +                               err = -EINVAL;
57426 +                       xenbus_dev_fatal(pdev->xdev, err,
57427 +                                        "Error reading PCI root %d", i);
57428 +                       goto out;
57429 +               }
57430 +
57431 +               err = pcifront_scan_root(pdev, domain, bus);
57432 +               if (err) {
57433 +                       xenbus_dev_fatal(pdev->xdev, err,
57434 +                                        "Error scanning PCI root %04x:%02x",
57435 +                                        domain, bus);
57436 +                       goto out;
57437 +               }
57438 +       }
57439 +
57440 +       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
57441 +       if (err)
57442 +               goto out;
57443 +
57444 +      out:
57445 +       spin_unlock(&pdev->dev_lock);
57446 +       return err;
57447 +}
57448 +
57449 +static int pcifront_try_disconnect(struct pcifront_device *pdev)
57450 +{
57451 +       int err = 0;
57452 +       XenbusState prev_state;
57453 +
57454 +       spin_lock(&pdev->dev_lock);
57455 +
57456 +       prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
57457 +
57458 +       if (prev_state < XenbusStateClosing)
57459 +               err = xenbus_switch_state(pdev->xdev, XenbusStateClosing);
57460 +
57461 +       if (!err && prev_state == XenbusStateConnected)
57462 +               pcifront_disconnect(pdev);
57463 +
57464 +       spin_unlock(&pdev->dev_lock);
57465 +
57466 +       return err;
57467 +}
57468 +
57469 +static void pcifront_backend_changed(struct xenbus_device *xdev,
57470 +                                    XenbusState be_state)
57471 +{
57472 +       struct pcifront_device *pdev = xdev->data;
57473 +
57474 +       switch (be_state) {
57475 +       case XenbusStateClosing:
57476 +               dev_warn(&xdev->dev, "backend going away!\n");
57477 +               pcifront_try_disconnect(pdev);
57478 +               break;
57479 +
57480 +       case XenbusStateClosed:
57481 +               dev_warn(&xdev->dev, "backend went away!\n");
57482 +               pcifront_try_disconnect(pdev);
57483 +
57484 +               device_unregister(&pdev->xdev->dev);
57485 +               break;
57486 +
57487 +       case XenbusStateConnected:
57488 +               pcifront_try_connect(pdev);
57489 +               break;
57490 +
57491 +       default:
57492 +               break;
57493 +       }
57494 +}
57495 +
57496 +static int pcifront_xenbus_probe(struct xenbus_device *xdev,
57497 +                                const struct xenbus_device_id *id)
57498 +{
57499 +       int err = 0;
57500 +       struct pcifront_device *pdev = alloc_pdev(xdev);
57501 +
57502 +       if (pdev == NULL) {
57503 +               err = -ENOMEM;
57504 +               xenbus_dev_fatal(xdev, err,
57505 +                                "Error allocating pcifront_device struct");
57506 +               goto out;
57507 +       }
57508 +
57509 +       err = pcifront_publish_info(pdev);
57510 +
57511 +      out:
57512 +       return err;
57513 +}
57514 +
57515 +static int pcifront_xenbus_remove(struct xenbus_device *xdev)
57516 +{
57517 +       if (xdev->data)
57518 +               free_pdev(xdev->data);
57519 +
57520 +       return 0;
57521 +}
57522 +
57523 +static struct xenbus_device_id xenpci_ids[] = {
57524 +       {"pci"},
57525 +       {{0}},
57526 +};
57527 +
57528 +static struct xenbus_driver xenbus_pcifront_driver = {
57529 +       .name                   = "pcifront",
57530 +       .owner                  = THIS_MODULE,
57531 +       .ids                    = xenpci_ids,
57532 +       .probe                  = pcifront_xenbus_probe,
57533 +       .remove                 = pcifront_xenbus_remove,
57534 +       .otherend_changed       = pcifront_backend_changed,
57535 +};
57536 +
57537 +static int __init pcifront_init(void)
57538 +{
57539 +       int err = 0;
57540 +
57541 +       err = xenbus_register_frontend(&xenbus_pcifront_driver);
57542 +
57543 +       return err;
57544 +}
57545 +
57546 +/* Initialize after the Xen PCI Frontend Stub is initialized */
57547 +subsys_initcall(pcifront_init);
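
(Taking the pciback and pcifront xenbus code together, a hedged summary of the connection handshake they implement; the state names are XenbusState values and the ordering is inferred from the code above, not from a separate spec.)

    pciback probe               backend  -> InitWait     wait for toolstack config
    toolstack writes num_devs/dev-N; be_watch fires
    pciback_setup_backend()     backend  -> Initialised  devices exported, roots published
    pcifront probe              frontend -> Initialised  ring granted, evtchn + magic published
    pciback_attach()            backend  -> Connected    ring mapped, evtchn bound
    pcifront_try_connect()      frontend -> Connected    root buses scanned
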
57548 diff -Nurp ref-linux-2.6.16.9/drivers/xen/privcmd/Makefile tmp-linux-2.6-xen.patch/drivers/xen/privcmd/Makefile
57549 --- ref-linux-2.6.16.9/drivers/xen/privcmd/Makefile     1970-01-01 01:00:00.000000000 +0100
57550 +++ tmp-linux-2.6-xen.patch/drivers/xen/privcmd/Makefile        2006-04-10 00:05:52.000000000 +0200
57551 @@ -0,0 +1,2 @@
57552 +
57553 +obj-y  := privcmd.o
57554 diff -Nurp ref-linux-2.6.16.9/drivers/xen/privcmd/privcmd.c tmp-linux-2.6-xen.patch/drivers/xen/privcmd/privcmd.c
57555 --- ref-linux-2.6.16.9/drivers/xen/privcmd/privcmd.c    1970-01-01 01:00:00.000000000 +0100
57556 +++ tmp-linux-2.6-xen.patch/drivers/xen/privcmd/privcmd.c       2006-04-10 00:05:52.000000000 +0200
57557 @@ -0,0 +1,302 @@
57558 +/******************************************************************************
57559 + * privcmd.c
57560 + * 
57561 + * Interface to privileged domain-0 commands.
57562 + * 
57563 + * Copyright (c) 2002-2004, K A Fraser, B Dragovic
57564 + */
57565 +
57566 +#include <linux/config.h>
57567 +#include <linux/kernel.h>
57568 +#include <linux/sched.h>
57569 +#include <linux/slab.h>
57570 +#include <linux/string.h>
57571 +#include <linux/errno.h>
57572 +#include <linux/mm.h>
57573 +#include <linux/mman.h>
57574 +#include <linux/swap.h>
57575 +#include <linux/smp_lock.h>
57576 +#include <linux/highmem.h>
57577 +#include <linux/pagemap.h>
57578 +#include <linux/seq_file.h>
57579 +#include <linux/kthread.h>
57580 +#include <asm/hypervisor.h>
57581 +
57582 +#include <asm/pgalloc.h>
57583 +#include <asm/pgtable.h>
57584 +#include <asm/uaccess.h>
57585 +#include <asm/tlb.h>
57587 +#include <xen/public/privcmd.h>
57588 +#include <xen/interface/xen.h>
57589 +#include <xen/interface/dom0_ops.h>
57590 +#include <xen/xen_proc.h>
57591 +
57592 +static struct proc_dir_entry *privcmd_intf;
57593 +static struct proc_dir_entry *capabilities_intf;
57594 +
57595 +#define NR_HYPERCALLS 32
57596 +static DECLARE_BITMAP(hypercall_permission_map, NR_HYPERCALLS);
57597 +
57598 +static int privcmd_ioctl(struct inode *inode, struct file *file,
57599 +                         unsigned int cmd, unsigned long data)
57600 +{
57601 +       int ret = -ENOSYS;
57602 +       void __user *udata = (void __user *) data;
57603 +
57604 +       switch (cmd) {
57605 +       case IOCTL_PRIVCMD_HYPERCALL: {
57606 +               privcmd_hypercall_t hypercall;
57607 +  
57608 +               if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
57609 +                       return -EFAULT;
57610 +
57611 +               /* Check hypercall number for validity. */
57612 +               if (hypercall.op >= NR_HYPERCALLS)
57613 +                       return -EINVAL;
57614 +               if (!test_bit(hypercall.op, hypercall_permission_map))
57615 +                       return -EINVAL;
57616 +
57617 +#if defined(__i386__)
57618 +               __asm__ __volatile__ (
57619 +                       "pushl %%ebx; pushl %%ecx; pushl %%edx; "
57620 +                       "pushl %%esi; pushl %%edi; "
57621 +                       "movl  4(%%eax),%%ebx ;"
57622 +                       "movl  8(%%eax),%%ecx ;"
57623 +                       "movl 12(%%eax),%%edx ;"
57624 +                       "movl 16(%%eax),%%esi ;"
57625 +                       "movl 20(%%eax),%%edi ;"
57626 +                       "movl   (%%eax),%%eax ;"
57627 +                       "shll $5,%%eax ;"
57628 +                       "addl $hypercall_page,%%eax ;"
57629 +                       "call *%%eax ;"
57630 +                       "popl %%edi; popl %%esi; popl %%edx; "
57631 +                       "popl %%ecx; popl %%ebx"
57632 +                       : "=a" (ret) : "0" (&hypercall) : "memory" );
57633 +#elif defined (__x86_64__)
57634 +               {
57635 +                       long ign1, ign2, ign3;
57636 +                       __asm__ __volatile__ (
57637 +                               "movq %8,%%r10; movq %9,%%r8;"
57638 +                               "shlq $5,%%rax ;"
57639 +                               "addq $hypercall_page,%%rax ;"
57640 +                               "call *%%rax"
57641 +                               : "=a" (ret), "=D" (ign1),
57642 +                                 "=S" (ign2), "=d" (ign3)
57643 +                               : "0" ((unsigned long)hypercall.op), 
57644 +                               "1" ((unsigned long)hypercall.arg[0]), 
57645 +                               "2" ((unsigned long)hypercall.arg[1]),
57646 +                               "3" ((unsigned long)hypercall.arg[2]), 
57647 +                               "g" ((unsigned long)hypercall.arg[3]),
57648 +                               "g" ((unsigned long)hypercall.arg[4])
57649 +                               : "r8", "r10", "memory" );
57650 +               }
57651 +#elif defined (__ia64__)
57652 +               __asm__ __volatile__ (
57653 +                       ";; mov r14=%2; mov r15=%3; "
57654 +                       "mov r16=%4; mov r17=%5; mov r18=%6;"
57655 +                       "mov r2=%1; break 0x1000;; mov %0=r8 ;;"
57656 +                       : "=r" (ret)
57657 +                       : "r" (hypercall.op),
57658 +                       "r" (hypercall.arg[0]),
57659 +                       "r" (hypercall.arg[1]),
57660 +                       "r" (hypercall.arg[2]),
57661 +                       "r" (hypercall.arg[3]),
57662 +                       "r" (hypercall.arg[4])
57663 +                       : "r14","r15","r16","r17","r18","r2","r8","memory");
57664 +#endif
57665 +       }
57666 +       break;
57667 +
57668 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
57669 +       case IOCTL_PRIVCMD_MMAP: {
57670 +#define PRIVCMD_MMAP_SZ 32
57671 +               privcmd_mmap_t mmapcmd;
57672 +               privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ];
57673 +               privcmd_mmap_entry_t __user *p;
57674 +               int i, rc;
57675 +
57676 +               if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
57677 +                       return -EFAULT;
57678 +
57679 +               p = mmapcmd.entry;
57680 +
57681 +               for (i = 0; i < mmapcmd.num;
57682 +                    i += PRIVCMD_MMAP_SZ, p += PRIVCMD_MMAP_SZ) {
57683 +                       int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)?
57684 +                               PRIVCMD_MMAP_SZ:(mmapcmd.num-i);
57685 +
57686 +                       if (copy_from_user(&msg, p,
57687 +                                          n*sizeof(privcmd_mmap_entry_t)))
57688 +                               return -EFAULT;
57689 +     
57690 +                       for (j = 0; j < n; j++) {
57691 +                               struct vm_area_struct *vma = 
57692 +                                       find_vma( current->mm, msg[j].va );
57693 +
57694 +                               if (!vma)
57695 +                                       return -EINVAL;
57696 +
57697 +                               if (msg[j].va > PAGE_OFFSET)
57698 +                                       return -EINVAL;
57699 +
57700 +                               if ((msg[j].va + (msg[j].npages << PAGE_SHIFT))
57701 +                                   > vma->vm_end )
57702 +                                       return -EINVAL;
57703 +
57704 +                               if ((rc = direct_remap_pfn_range(
57705 +                                       vma,
57706 +                                       msg[j].va&PAGE_MASK, 
57707 +                                       msg[j].mfn, 
57708 +                                       msg[j].npages<<PAGE_SHIFT, 
57709 +                                       vma->vm_page_prot,
57710 +                                       mmapcmd.dom)) < 0)
57711 +                                       return rc;
57712 +                       }
57713 +               }
57714 +               ret = 0;
57715 +       }
57716 +       break;
57717 +
57718 +       case IOCTL_PRIVCMD_MMAPBATCH: {
57719 +               mmu_update_t u;
57720 +               privcmd_mmapbatch_t m;
57721 +               struct vm_area_struct *vma = NULL;
57722 +               unsigned long __user *p;
57723 +               unsigned long addr, mfn; 
57724 +               uint64_t ptep;
57725 +               int i;
57726 +
57727 +               if (copy_from_user(&m, udata, sizeof(m))) {
57728 +                       ret = -EFAULT;
57729 +                       goto batch_err;
57730 +               }
57731 +
57732 +               if (m.dom == DOMID_SELF) {
57733 +                       ret = -EINVAL;
57734 +                       goto batch_err;
57735 +               }
57736 +
57737 +               vma = find_vma(current->mm, m.addr);
57738 +               if (!vma) {
57739 +                       ret = -EINVAL;
57740 +                       goto batch_err;
57741 +               }
57742 +
57743 +               if (m.addr > PAGE_OFFSET) {
57744 +                       ret = -EFAULT;
57745 +                       goto batch_err;
57746 +               }
57747 +
57748 +               if ((m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end) {
57749 +                       ret = -EFAULT;
57750 +                       goto batch_err;
57751 +               }
57752 +
57753 +               p = m.arr;
57754 +               addr = m.addr;
57755 +               for (i = 0; i < m.num; i++, addr += PAGE_SIZE, p++) {
57756 +                       if (get_user(mfn, p))
57757 +                               return -EFAULT;
57758 +#ifdef __ia64__
57759 +                       ret = remap_pfn_range(vma,
57760 +                                             addr&PAGE_MASK,
57761 +                                             mfn,
57762 +                                             1<<PAGE_SHIFT,
57763 +                                             vma->vm_page_prot);
57764 +                       if (ret < 0)
57765 +                           goto batch_err;
57766 +#else
57767 +
57768 +                       ret = create_lookup_pte_addr(vma->vm_mm, addr, &ptep);
57769 +                       if (ret)
57770 +                               goto batch_err;
57771 +
57772 +                       u.val = pte_val_ma(pfn_pte_ma(mfn, vma->vm_page_prot));
57773 +                       u.ptr = ptep;
57774 +
57775 +                       if (HYPERVISOR_mmu_update(&u, 1, NULL, m.dom) < 0)
57776 +                               put_user(0xF0000000 | mfn, p);
57777 +#endif
57778 +               }
57779 +
57780 +               ret = 0;
57781 +               break;
57782 +
57783 +       batch_err:
57784 +               printk(KERN_WARNING "batch_err ret=%d vma=%p addr=%lx "
57785 +                      "num=%d arr=%p %lx-%lx\n",
57786 +                      ret, vma, m.addr, m.num, m.arr,
57787 +                      vma ? vma->vm_start : 0, vma ? vma->vm_end : 0);
57788 +               break;
57789 +       }
57790 +       break;
57791 +#endif
57792 +
57793 +       default:
57794 +               ret = -EINVAL;
57795 +               break;
57796 +       }
57797 +
57798 +       return ret;
57799 +}
57800 +
57801 +static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
57802 +{
57803 +       /* DONTCOPY is essential for Xen as copy_page_range is broken. */
57804 +       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
57805 +
57806 +       return 0;
57807 +}
57808 +
57809 +static struct file_operations privcmd_file_ops = {
57810 +       .ioctl = privcmd_ioctl,
57811 +       .mmap  = privcmd_mmap,
57812 +};
57813 +
57814 +static int capabilities_read(char *page, char **start, off_t off,
57815 +                        int count, int *eof, void *data)
57816 +{
57817 +       int len = 0;
57818 +       *page = 0;
57819 +
57820 +       if (xen_start_info->flags & SIF_INITDOMAIN)
57821 +               len = sprintf( page, "control_d\n" );
57822 +
57823 +       *eof = 1;
57824 +       return len;
57825 +}
57826 +
57827 +static int __init privcmd_init(void)
57828 +{
57829 +       /* Set of hypercalls that privileged applications may execute. */
57830 +       set_bit(__HYPERVISOR_acm_op,           hypercall_permission_map);
57831 +       set_bit(__HYPERVISOR_dom0_op,          hypercall_permission_map);
57832 +       set_bit(__HYPERVISOR_event_channel_op, hypercall_permission_map);
57833 +       set_bit(__HYPERVISOR_memory_op,        hypercall_permission_map);
57834 +       set_bit(__HYPERVISOR_mmu_update,       hypercall_permission_map);
57835 +       set_bit(__HYPERVISOR_mmuext_op,        hypercall_permission_map);
57836 +       set_bit(__HYPERVISOR_xen_version,      hypercall_permission_map);
57837 +
57838 +       privcmd_intf = create_xen_proc_entry("privcmd", 0400);
57839 +       if (privcmd_intf != NULL)
57840 +               privcmd_intf->proc_fops = &privcmd_file_ops;
57841 +
57842 +       capabilities_intf = create_xen_proc_entry("capabilities", 0400 );
57843 +       if (capabilities_intf != NULL)
57844 +               capabilities_intf->read_proc = capabilities_read;
57845 +
57846 +       return 0;
57847 +}
57848 +
57849 +__initcall(privcmd_init);
57850 +
57851 +/*
57852 + * Local variables:
57853 + *  c-file-style: "linux"
57854 + *  indent-tabs-mode: t
57855 + *  c-indent-level: 8
57856 + *  c-basic-offset: 8
57857 + *  tab-width: 8
57858 + * End:
57859 + */
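
(A hedged userspace sketch of how this driver is reached, in the style of what a management library would do. The header install paths are assumptions; privcmd_hypercall_t, IOCTL_PRIVCMD_HYPERCALL, __HYPERVISOR_xen_version and XENVER_version come from the Xen public headers this file includes. XENVER_version ignores its argument buffer, so arg[1] can be 0.)

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <xen/public/privcmd.h>     /* assumed install path */
    #include <xen/interface/xen.h>
    #include <xen/interface/version.h>

    /* Returns the Xen version as (major << 16) | minor, or -1 on error. */
    int xen_version_via_privcmd(void)
    {
            privcmd_hypercall_t call = {
                    .op  = __HYPERVISOR_xen_version,
                    .arg = { XENVER_version, 0 },
            };
            int fd = open("/proc/xen/privcmd", O_RDWR);
            int ret;

            if (fd < 0)
                    return -1;
            /* Lands in privcmd_ioctl() above; __HYPERVISOR_xen_version is on
             * the permission bitmap set up in privcmd_init(). */
            ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
            close(fd);
            return ret;
    }

(Relatedly, capabilities_read() means `cat /proc/xen/capabilities` prints "control_d" only in the initial domain, which is how the tools detect dom0.)
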
57860 diff -Nurp ref-linux-2.6.16.9/drivers/xen/tpmback/common.h tmp-linux-2.6-xen.patch/drivers/xen/tpmback/common.h
57861 --- ref-linux-2.6.16.9/drivers/xen/tpmback/common.h     1970-01-01 01:00:00.000000000 +0100
57862 +++ tmp-linux-2.6-xen.patch/drivers/xen/tpmback/common.h        2006-04-10 00:05:52.000000000 +0200
57863 @@ -0,0 +1,91 @@
57864 +/******************************************************************************
57865 + * drivers/xen/tpmback/common.h
57866 + */
57867 +
57868 +#ifndef __TPMIF__BACKEND__COMMON_H__
57869 +#define __TPMIF__BACKEND__COMMON_H__
57870 +
57871 +#include <linux/config.h>
57872 +#include <linux/version.h>
57873 +#include <linux/module.h>
57874 +#include <linux/interrupt.h>
57875 +#include <linux/slab.h>
57876 +#include <xen/evtchn.h>
57877 +#include <xen/driver_util.h>
57878 +#include <xen/interface/grant_table.h>
57879 +#include <xen/interface/io/tpmif.h>
57880 +#include <asm/io.h>
57881 +#include <asm/pgalloc.h>
57882 +
57883 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
57884 +                                    __FILE__ , __LINE__ , ## _a )
57885 +
57886 +typedef struct tpmif_st {
57887 +       struct list_head tpmif_list;
57888 +       /* Unique identifier for this interface. */
57889 +       domid_t domid;
57890 +       unsigned int handle;
57891 +
57892 +       /* Physical parameters of the comms window. */
57893 +       unsigned int evtchn;
57894 +       unsigned int irq;
57895 +
57896 +       /* The shared rings and indexes. */
57897 +       tpmif_tx_interface_t *tx;
57898 +       struct vm_struct *tx_area;
57899 +
57900 +       /* Miscellaneous private stuff. */
57901 +       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
57902 +       int active;
57903 +
57904 +       struct tpmif_st *hash_next;
57905 +       struct list_head list;  /* scheduling list */
57906 +       atomic_t refcnt;
57907 +
57908 +       long int tpm_instance;
57909 +       unsigned long mmap_vstart;
57910 +
57911 +       struct work_struct work;
57912 +
57913 +       grant_handle_t shmem_handle;
57914 +       grant_ref_t shmem_ref;
57915 +} tpmif_t;
57916 +
57917 +void tpmif_disconnect_complete(tpmif_t * tpmif);
57918 +tpmif_t *tpmif_find(domid_t domid, long int instance);
57919 +void tpmif_interface_init(void);
57920 +void tpmif_interface_exit(void);
57921 +void tpmif_schedule_work(tpmif_t * tpmif);
57922 +void tpmif_deschedule_work(tpmif_t * tpmif);
57923 +void tpmif_xenbus_init(void);
57924 +void tpmif_xenbus_exit(void);
57925 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
57926 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
57927 +int tpmif_vtpm_open(tpmif_t *tpmif, domid_t domain, u32 instance);
57928 +int tpmif_vtpm_close(u32 instance);
57929 +
57930 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
57931 +
57932 +#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
57933 +#define tpmif_put(_b)                             \
57934 +    do {                                          \
57935 +        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
57936 +            tpmif_disconnect_complete(_b);        \
57937 +    } while (0)
57938 +
57939 +
57940 +extern int num_frontends;
57941 +
57942 +#define MMAP_VADDR(t,_req) ((t)->mmap_vstart + ((_req) * PAGE_SIZE))
57943 +
57944 +#endif /* __TPMIF__BACKEND__COMMON_H__ */
57945 +
57946 +/*
57947 + * Local variables:
57948 + *  c-file-style: "linux"
57949 + *  indent-tabs-mode: t
57950 + *  c-indent-level: 8
57951 + *  c-basic-offset: 8
57952 + *  tab-width: 8
57953 + * End:
57954 + */
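
(A hedged usage note on the refcounting macros above: alloc_tpmif() — in interface.c below — starts refcnt at 1, and tpmif_find() takes a reference on an existing interface, so each successful lookup is expected to be balanced by a tpmif_put(); the final put fires tpmif_disconnect_complete().)

    tpmif_t *tpmif = tpmif_find(domid, instance);
    if (!IS_ERR(tpmif)) {
            /* ... use the interface ... */
            tpmif_put(tpmif);   /* last reference -> tpmif_disconnect_complete() */
    }
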
57955 diff -Nurp ref-linux-2.6.16.9/drivers/xen/tpmback/interface.c tmp-linux-2.6-xen.patch/drivers/xen/tpmback/interface.c
57956 --- ref-linux-2.6.16.9/drivers/xen/tpmback/interface.c  1970-01-01 01:00:00.000000000 +0100
57957 +++ tmp-linux-2.6-xen.patch/drivers/xen/tpmback/interface.c     2006-04-10 00:05:52.000000000 +0200
57958 @@ -0,0 +1,194 @@
57959 +/*****************************************************************************
57960 + * drivers/xen/tpmback/interface.c
57961 + *
57962 + * Virtual TPM interface management.
57963 + *
57964 + * Copyright (c) 2005, IBM Corporation
57965 + *
57966 + * Author: Stefan Berger, stefanb@us.ibm.com
57967 + *
57968 + * This code has been derived from drivers/xen/netback/interface.c
57969 + * Copyright (c) 2004, Keir Fraser
57970 + */
57971 +
57972 +#include "common.h"
57973 +#include <xen/balloon.h>
57974 +
57975 +static kmem_cache_t *tpmif_cachep;
57976 +int num_frontends = 0;
57977 +
57978 +LIST_HEAD(tpmif_list);
57979 +
57980 +static tpmif_t *alloc_tpmif(domid_t domid, long int instance)
57981 +{
57982 +       struct page *page;
57983 +       tpmif_t *tpmif;
57984 +
57985 +       tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL);
57986 +       if (!tpmif)
57987 +               return ERR_PTR(-ENOMEM);
57988 +
57989 +       memset(tpmif, 0, sizeof (*tpmif));
57990 +       tpmif->domid = domid;
57991 +       tpmif->status = DISCONNECTED;
57992 +       tpmif->tpm_instance = instance;
57993 +       atomic_set(&tpmif->refcnt, 1);
57994 +
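+       /* Reserve an empty page range; frontend pages are grant-mapped into it at MMAP_VADDR(). */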
57995 +       page = balloon_alloc_empty_page_range(TPMIF_TX_RING_SIZE);
57996 +       BUG_ON(page == NULL);
57997 +       tpmif->mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
57998 +
57999 +       list_add(&tpmif->tpmif_list, &tpmif_list);
58000 +       num_frontends++;
58001 +
58002 +       return tpmif;
58003 +}
58004 +
58005 +static void free_tpmif(tpmif_t * tpmif)
58006 +{
58007 +       num_frontends--;
58008 +       list_del(&tpmif->tpmif_list);
58009 +       kmem_cache_free(tpmif_cachep, tpmif);
58010 +}
58011 +
58012 +tpmif_t *tpmif_find(domid_t domid, long int instance)
58013 +{
58014 +       tpmif_t *tpmif;
58015 +
58016 +       list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
58017 +               if (tpmif->tpm_instance == instance) {
58018 +                       if (tpmif->domid == domid) {
58019 +                               tpmif_get(tpmif);
58020 +                               return tpmif;
58021 +                       } else {
58022 +                               return ERR_PTR(-EEXIST);
58023 +                       }
58024 +               }
58025 +       }
58026 +
58027 +       return alloc_tpmif(domid, instance);
58028 +}
58029 +
58030 +static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page)
58031 +{
58032 +       int ret;
58033 +       struct gnttab_map_grant_ref op = {
58034 +               .host_addr = (unsigned long)tpmif->tx_area->addr,
58035 +               .flags = GNTMAP_host_map,
58036 +               .ref = shared_page,
58037 +               .dom = tpmif->domid,
58038 +       };
58039 +
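+       /* Keep the area's page tables in place while the hypercall installs the mapping. */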
58040 +       lock_vm_area(tpmif->tx_area);
58041 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
58042 +       unlock_vm_area(tpmif->tx_area);
58043 +       BUG_ON(ret);
58044 +
58045 +       if (op.status) {
58046 +               DPRINTK("Grant table operation failure!\n");
58047 +               return op.status;
58048 +       }
58049 +
58050 +       tpmif->shmem_ref = shared_page;
58051 +       tpmif->shmem_handle = op.handle;
58052 +
58053 +       return 0;
58054 +}
58055 +
58056 +static void unmap_frontend_page(tpmif_t *tpmif)
58057 +{
58058 +       struct gnttab_unmap_grant_ref op;
58059 +       int ret;
58060 +
58061 +       op.host_addr    = (unsigned long)tpmif->tx_area->addr;
58062 +       op.handle       = tpmif->shmem_handle;
58063 +       op.dev_bus_addr = 0;
58064 +
58065 +       lock_vm_area(tpmif->tx_area);
58066 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
58067 +       unlock_vm_area(tpmif->tx_area);
58068 +       BUG_ON(ret);
58069 +}
58070 +
58071 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn)
58072 +{
58073 +       int err;
58074 +       evtchn_op_t op = {
58075 +               .cmd = EVTCHNOP_bind_interdomain,
58076 +               .u.bind_interdomain.remote_dom = tpmif->domid,
58077 +               .u.bind_interdomain.remote_port = evtchn,
58078 +        };
58079 +
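+        /* Already bound to an irq: the ring was mapped on an earlier call. */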
58080 +        if (tpmif->irq) {
58081 +                return 0;
58082 +        }
58083 +
58084 +       if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL)
58085 +               return -ENOMEM;
58086 +
58087 +       err = map_frontend_page(tpmif, shared_page);
58088 +       if (err) {
58089 +               free_vm_area(tpmif->tx_area);
58090 +               return err;
58091 +       }
58092 +
58093 +       err = HYPERVISOR_event_channel_op(&op);
58094 +       if (err) {
58095 +               unmap_frontend_page(tpmif);
58096 +               free_vm_area(tpmif->tx_area);
58097 +               return err;
58098 +       }
58099 +
58100 +       tpmif->evtchn = op.u.bind_interdomain.local_port;
58101 +
58102 +       tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr;
58103 +
58104 +       tpmif->irq = bind_evtchn_to_irqhandler(
58105 +               tpmif->evtchn, tpmif_be_int, 0, "tpmif-backend", tpmif);
58106 +       tpmif->shmem_ref = shared_page;
58107 +       tpmif->active = 1;
58108 +
58109 +       return 0;
58110 +}
58111 +
58112 +static void __tpmif_disconnect_complete(void *arg)
58113 +{
58114 +       tpmif_t *tpmif = (tpmif_t *) arg;
58115 +
58116 +       if (tpmif->irq)
58117 +               unbind_from_irqhandler(tpmif->irq, tpmif);
58118 +
58119 +       if (tpmif->tx) {
58120 +               unmap_frontend_page(tpmif);
58121 +               free_vm_area(tpmif->tx_area);
58122 +       }
58123 +
58124 +       free_tpmif(tpmif);
58125 +}
58126 +
58127 +void tpmif_disconnect_complete(tpmif_t * tpmif)
58128 +{
58129 +       INIT_WORK(&tpmif->work, __tpmif_disconnect_complete, (void *)tpmif);
58130 +       schedule_work(&tpmif->work);
58131 +}
58132 +
58133 +void __init tpmif_interface_init(void)
58134 +{
58135 +       tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
58136 +                                        0, 0, NULL, NULL);
58137 +}
58138 +
58139 +void tpmif_interface_exit(void)
58140 +{
58141 +       kmem_cache_destroy(tpmif_cachep);
58142 +}
58143 +
58144 +/*
58145 + * Local variables:
58146 + *  c-file-style: "linux"
58147 + *  indent-tabs-mode: t
58148 + *  c-indent-level: 8
58149 + *  c-basic-offset: 8
58150 + *  tab-width: 8
58151 + * End:
58152 + */
58153 diff -Nurp ref-linux-2.6.16.9/drivers/xen/tpmback/Makefile tmp-linux-2.6-xen.patch/drivers/xen/tpmback/Makefile
58154 --- ref-linux-2.6.16.9/drivers/xen/tpmback/Makefile     1970-01-01 01:00:00.000000000 +0100
58155 +++ tmp-linux-2.6-xen.patch/drivers/xen/tpmback/Makefile        2006-04-10 00:05:52.000000000 +0200
58156 @@ -0,0 +1,4 @@
58157 +
58158 +obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmbk.o
58159 +
58160 +tpmbk-y += tpmback.o interface.o xenbus.o
58161 diff -Nurp ref-linux-2.6.16.9/drivers/xen/tpmback/tpmback.c tmp-linux-2.6-xen.patch/drivers/xen/tpmback/tpmback.c
58162 --- ref-linux-2.6.16.9/drivers/xen/tpmback/tpmback.c    1970-01-01 01:00:00.000000000 +0100
58163 +++ tmp-linux-2.6-xen.patch/drivers/xen/tpmback/tpmback.c       2006-04-10 00:05:52.000000000 +0200
58164 @@ -0,0 +1,1060 @@
58165 +/******************************************************************************
58166 + * drivers/xen/tpmback/tpmback.c
58167 + *
58168 + * Copyright (c) 2005, IBM Corporation
58169 + *
58170 + * Author: Stefan Berger, stefanb@us.ibm.com
58171 + * Grant table support: Mahadevan Gomathisankaran
58172 + *
58173 + * This code has been derived from drivers/xen/netback/netback.c
58174 + * Copyright (c) 2002-2004, K A Fraser
58175 + *
58176 + */
58177 +
58178 +#include "common.h"
58179 +#include <xen/evtchn.h>
58180 +
58181 +#include <linux/types.h>
58182 +#include <linux/list.h>
58183 +#include <linux/miscdevice.h>
58184 +#include <linux/poll.h>
58185 +#include <asm/uaccess.h>
58186 +#include <xen/xenbus.h>
58187 +#include <xen/interface/grant_table.h>
58188 +
58189 +/* local data structures */
58190 +struct data_exchange {
58191 +       struct list_head pending_pak;
58192 +       struct list_head current_pak;
58193 +       unsigned int copied_so_far;
58194 +       u8 has_opener;
58195 +       rwlock_t pak_lock;      // protects all of the previous fields
58196 +       wait_queue_head_t wait_queue;
58197 +};
58198 +
58199 +struct vtpm_resp_hdr {
58200 +       uint32_t instance_no;
58201 +       uint16_t tag_no;
58202 +       uint32_t len_no;
58203 +       uint32_t ordinal_no;
58204 +} __attribute__ ((packed));
58205 +
58206 +struct packet {
58207 +       struct list_head next;
58208 +       unsigned int data_len;
58209 +       u8 *data_buffer;
58210 +       tpmif_t *tpmif;
58211 +       u32 tpm_instance;
58212 +       u8 req_tag;
58213 +       u32 last_read;
58214 +       u8 flags;
58215 +       struct timer_list processing_timer;
58216 +};
58217 +
58218 +enum {
58219 +       PACKET_FLAG_DISCARD_RESPONSE = 1,
58220 +       PACKET_FLAG_CHECK_RESPONSESTATUS = 2,
58221 +};
58222 +
58223 +/* local variables */
58224 +static struct data_exchange dataex;
58225 +
58226 +/* local function prototypes */
58227 +static int _packet_write(struct packet *pak,
58228 +                        const char *data, size_t size, int userbuffer);
58229 +static void processing_timeout(unsigned long ptr);
58230 +static int packet_read_shmem(struct packet *pak,
58231 +                            tpmif_t * tpmif,
58232 +                            u32 offset,
58233 +                            char *buffer, int isuserbuffer, u32 left);
58234 +static int vtpm_queue_packet(struct packet *pak);
58235 +
58236 +#define MIN(x,y)  (((x) < (y)) ? (x) : (y))
58237 +
58238 +/***************************************************************
58239 + Buffer copying for user and kernel space buffers.
58240 +***************************************************************/
58241 +static inline int copy_from_buffer(void *to,
58242 +                                  const void *from, unsigned long size,
58243 +                                  int isuserbuffer)
58244 +{
58245 +       if (isuserbuffer) {
58246 +               if (copy_from_user(to, (void __user *)from, size))
58247 +                       return -EFAULT;
58248 +       } else {
58249 +               memcpy(to, from, size);
58250 +       }
58251 +       return 0;
58252 +}
58253 +
58254 +static inline int copy_to_buffer(void *to,
58255 +                                const void *from, unsigned long size,
58256 +                                int isuserbuffer)
58257 +{
58258 +       if (isuserbuffer) {
58259 +               if (copy_to_user((void __user *)to, from, size))
58260 +                       return -EFAULT;
58261 +       } else {
58262 +               memcpy(to, from, size);
58263 +       }
58264 +       return 0;
58265 +}
58266 +
58267 +/***************************************************************
58268 + Packet-related functions
58269 +***************************************************************/
58270 +
58271 +static struct packet *packet_find_instance(struct list_head *head,
58272 +                                          u32 tpm_instance)
58273 +{
58274 +       struct packet *pak;
58275 +       struct list_head *p;
58276 +
58277 +       /*
58278 +        * traverse the list of packets and return the first
58279 +        * one with the given instance number
58280 +        */
58281 +       list_for_each(p, head) {
58282 +               pak = list_entry(p, struct packet, next);
58283 +
58284 +               if (pak->tpm_instance == tpm_instance) {
58285 +                       return pak;
58286 +               }
58287 +       }
58288 +       return NULL;
58289 +}
58290 +
58291 +static struct packet *packet_find_packet(struct list_head *head, void *packet)
58292 +{
58293 +       struct packet *pak;
58294 +       struct list_head *p;
58295 +
58296 +       /*
58297 +        * traverse the list of packets and return the
58298 +        * entry matching the given packet pointer
58299 +        */
58300 +       list_for_each(p, head) {
58301 +               pak = list_entry(p, struct packet, next);
58302 +
58303 +               if (pak == packet) {
58304 +                       return pak;
58305 +               }
58306 +       }
58307 +       return NULL;
58308 +}
58309 +
58310 +static struct packet *packet_alloc(tpmif_t * tpmif,
58311 +                                  u32 size, u8 req_tag, u8 flags)
58312 +{
58313 +       struct packet *pak = NULL;
58314 +       pak = kzalloc(sizeof (struct packet), GFP_KERNEL);
58315 +       if (NULL != pak) {
58316 +               if (tpmif) {
58317 +                       pak->tpmif = tpmif;
58318 +                       pak->tpm_instance = tpmif->tpm_instance;
58319 +               }
58320 +               pak->data_len = size;
58321 +               pak->req_tag = req_tag;
58322 +               pak->last_read = 0;
58323 +               pak->flags = flags;
58324 +
58325 +               /*
58326 +                * cannot do tpmif_get(tpmif); bad things happen
58327 +                * on the last tpmif_put()
58328 +                */
58329 +               init_timer(&pak->processing_timer);
58330 +               pak->processing_timer.function = processing_timeout;
58331 +               pak->processing_timer.data = (unsigned long)pak;
58332 +       }
58333 +       return pak;
58334 +}
58335 +
58336 +static inline void packet_reset(struct packet *pak)
58337 +{
58338 +       pak->last_read = 0;
58339 +}
58340 +
58341 +static void packet_free(struct packet *pak)
58342 +{
58343 +       if (timer_pending(&pak->processing_timer)) {
58344 +               BUG();
58345 +       }
58346 +       kfree(pak->data_buffer);
58347 +       /*
58348 +        * cannot do tpmif_put(pak->tpmif); bad things happen
58349 +        * on the last tpmif_put()
58350 +        */
58351 +       kfree(pak);
58352 +}
58353 +
58354 +static int packet_set(struct packet *pak,
58355 +                     const unsigned char *buffer, u32 size)
58356 +{
58357 +       int rc = 0;
58358 +       unsigned char *buf = kmalloc(size, GFP_KERNEL);
58359 +
58360 +       if (buf) {
58361 +               pak->data_buffer = buf;
58362 +               memcpy(buf, buffer, size);
58363 +               pak->data_len = size;
58364 +       } else {
58365 +               rc = -ENOMEM;
58366 +       }
58367 +       return rc;
58368 +}
58369 +
58370 +/*
58371 + * Write data to the shared memory and send it to the FE.
58372 + */
58373 +static int packet_write(struct packet *pak,
58374 +                       const char *data, size_t size, int isuserbuffer)
58375 +{
58376 +       int rc = 0;
58377 +
58378 +       if ((pak->flags & PACKET_FLAG_CHECK_RESPONSESTATUS)) {
58379 +#ifdef CONFIG_XEN_TPMDEV_CLOSE_IF_VTPM_FAILS
58380 +               u32 res;
58381 +
58382 +               if (copy_from_buffer(&res,
58383 +                                    &data[2 + 4], sizeof (res),
58384 +                                    isuserbuffer)) {
58385 +                       return -EFAULT;
58386 +               }
58387 +
58388 +               if (res != 0) {
58389 +                       /*
58390 +                        * Close down this device. The FE should be
58391 +                        * notified about the closure.
58392 +                        */
58393 +                       if (!pak->tpmif) {
58394 +                               return -EFAULT;
58395 +                       }
58396 +                       pak->tpmif->status = DISCONNECTING;
58397 +               }
58398 +#endif
58399 +       }
58400 +
58401 +       if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) {
58402 +               /* Don't send a response to this packet. Just acknowledge it. */
58403 +               rc = size;
58404 +       } else {
58405 +               rc = _packet_write(pak, data, size, isuserbuffer);
58406 +       }
58407 +
58408 +       return rc;
58409 +}
58410 +
58411 +int _packet_write(struct packet *pak,
58412 +                 const char *data, size_t size, int isuserbuffer)
58413 +{
58414 +       /*
58415 +        * Write into the shared memory pages directly
58416 +        * and send it to the front end.
58417 +        */
58418 +       tpmif_t *tpmif = pak->tpmif;
58419 +       grant_handle_t handle;
58420 +       int rc = 0;
58421 +       unsigned int i = 0;
58422 +       unsigned int offset = 0;
58423 +
58424 +       if (tpmif == NULL) {
58425 +               return -EFAULT;
58426 +       }
58427 +
58428 +       if (tpmif->status == DISCONNECTED) {
58429 +               return size;
58430 +       }
58431 +
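+       /* Map each shared ring page, copy up to a page of data into it, then unmap. */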
58432 +       while (offset < size && i < TPMIF_TX_RING_SIZE) {
58433 +               unsigned int tocopy;
58434 +               struct gnttab_map_grant_ref map_op;
58435 +               struct gnttab_unmap_grant_ref unmap_op;
58436 +               tpmif_tx_request_t *tx;
58437 +
58438 +               tx = &tpmif->tx->ring[i].req;
58439 +
58440 +               if (0 == tx->addr) {
58441 +                       DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
58442 +                       return 0;
58443 +               }
58444 +
58445 +               map_op.host_addr = MMAP_VADDR(tpmif, i);
58446 +               map_op.flags = GNTMAP_host_map;
58447 +               map_op.ref = tx->ref;
58448 +               map_op.dom = tpmif->domid;
58449 +
58450 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
58451 +                                                      &map_op, 1))) {
58452 +                       BUG();
58453 +               }
58454 +
58455 +               handle = map_op.handle;
58456 +
58457 +               if (map_op.status) {
58458 +                       DPRINTK("Grant table operation failure!\n");
58459 +                       return 0;
58460 +               }
58461 +               set_phys_to_machine(__pa(MMAP_VADDR(tpmif, i)) >> PAGE_SHIFT,
58462 +                                   FOREIGN_FRAME(map_op.dev_bus_addr >> PAGE_SHIFT));
58464 +
58465 +               tocopy = MIN(size - offset, PAGE_SIZE);
58466 +
58467 +               if (copy_from_buffer((void *)(MMAP_VADDR(tpmif, i) |
58468 +                                             (tx->addr & ~PAGE_MASK)),
58469 +                                    &data[offset], tocopy, isuserbuffer)) {
58470 +                       tpmif_put(tpmif);
58471 +                       return -EFAULT;
58472 +               }
58473 +               tx->size = tocopy;
58474 +
58475 +               unmap_op.host_addr = MMAP_VADDR(tpmif, i);
58476 +               unmap_op.handle = handle;
58477 +               unmap_op.dev_bus_addr = 0;
58478 +
58479 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
58480 +                                                      &unmap_op, 1))) {
58482 +                       BUG();
58483 +               }
58484 +
58485 +               offset += tocopy;
58486 +               i++;
58487 +       }
58488 +
58489 +       rc = offset;
58490 +       DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
58491 +       notify_remote_via_irq(tpmif->irq);
58492 +
58493 +       return rc;
58494 +}
58495 +
58496 +/*
58497 + * Read data from the shared memory and copy it directly into the
58498 + * provided buffer. Advance the last_read indicator, which tells
58499 + * how many bytes have already been read.
58500 + */
58501 +static int packet_read(struct packet *pak, size_t numbytes,
58502 +                      char *buffer, size_t buffersize, int isuserbuffer)
58503 +{
58504 +       tpmif_t *tpmif = pak->tpmif;
58505 +
58506 +       /*
58507 +        * Read 'numbytes' of data from the buffer. The first 4
58508 +        * bytes are the instance number in network byte order,
58509 +        * after that come the data from the shared memory buffer.
58510 +        */
58511 +       u32 to_copy;
58512 +       u32 offset = 0;
58513 +       u32 room_left = buffersize;
58514 +
58515 +       if (pak->last_read < 4) {
58516 +               /*
58517 +                * copy the instance number into the buffer
58518 +                */
58519 +               u32 instance_no = htonl(pak->tpm_instance);
58520 +               u32 last_read = pak->last_read;
58521 +
58522 +               to_copy = MIN(4 - last_read, numbytes);
58523 +
58524 +               if (copy_to_buffer(&buffer[0],
58525 +                                  &(((u8 *) & instance_no)[last_read]),
58526 +                                  to_copy, isuserbuffer)) {
58527 +                       return -EFAULT;
58528 +               }
58529 +
58530 +               pak->last_read += to_copy;
58531 +               offset += to_copy;
58532 +               room_left -= to_copy;
58533 +       }
58534 +
58535 +       /*
58536 +        * If the packet has a data buffer appended, read from it...
58537 +        */
58538 +
58539 +       if (room_left > 0) {
58540 +               if (pak->data_buffer) {
58541 +                       u32 to_copy = MIN(pak->data_len - offset, room_left);
58542 +                       u32 last_read = pak->last_read - 4;
58543 +
58544 +                       if (copy_to_buffer(&buffer[offset],
58545 +                                          &pak->data_buffer[last_read],
58546 +                                          to_copy, isuserbuffer)) {
58547 +                               return -EFAULT;
58548 +                       }
58549 +                       pak->last_read += to_copy;
58550 +                       offset += to_copy;
58551 +               } else {
58552 +                       offset = packet_read_shmem(pak,
58553 +                                                  tpmif,
58554 +                                                  offset,
58555 +                                                  buffer,
58556 +                                                  isuserbuffer, room_left);
58557 +               }
58558 +       }
58559 +       return offset;
58560 +}
58561 +
58562 +static int packet_read_shmem(struct packet *pak,
58563 +                            tpmif_t * tpmif,
58564 +                            u32 offset, char *buffer, int isuserbuffer,
58565 +                            u32 room_left)
58566 +{
58567 +       u32 last_read = pak->last_read - 4;
58568 +       u32 i = (last_read / PAGE_SIZE);
58569 +       u32 pg_offset = last_read & (PAGE_SIZE - 1);
58570 +       u32 to_copy;
58571 +       grant_handle_t handle;
58572 +
58573 +       tpmif_tx_request_t *tx;
58574 +
58575 +       tx = &tpmif->tx->ring[0].req;
58576 +       /*
58577 +        * Start copying data at the page with index 'i'
58578 +        * and within that page at offset 'pg_offset'.
58579 +        * Copy a maximum of 'room_left' bytes.
58580 +        */
58581 +       to_copy = MIN(PAGE_SIZE - pg_offset, room_left);
58582 +       while (to_copy > 0) {
58583 +               void *src;
58584 +               struct gnttab_map_grant_ref map_op;
58585 +               struct gnttab_unmap_grant_ref unmap_op;
58586 +
58587 +               tx = &tpmif->tx->ring[i].req;
58588 +
58589 +               map_op.host_addr = MMAP_VADDR(tpmif, i);
58590 +               map_op.flags = GNTMAP_host_map;
58591 +               map_op.ref = tx->ref;
58592 +               map_op.dom = tpmif->domid;
58593 +
58594 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
58595 +                                                      &map_op, 1))) {
58596 +                       BUG();
58597 +               }
58598 +
58599 +               if (map_op.status) {
58600 +                       DPRINTK("Grant table operation failure!\n");
58601 +                       return -EFAULT;
58602 +               }
58603 +
58604 +               handle = map_op.handle;
58605 +
58606 +               if (to_copy > tx->size) {
58607 +                       /*
58608 +                        * User requests more than what's available
58609 +                        */
58610 +                       to_copy = MIN(tx->size, to_copy);
58611 +               }
58612 +
58613 +               DPRINTK("Copying from mapped memory at %08lx\n",
58614 +                       (unsigned long)(MMAP_VADDR(tpmif, i) |
58615 +                                       (tx->addr & ~PAGE_MASK)));
58616 +
58617 +               src = (void *)(MMAP_VADDR(tpmif, i) |
58618 +                              ((tx->addr & ~PAGE_MASK) + pg_offset));
58619 +               if (copy_to_buffer(&buffer[offset],
58620 +                                  src, to_copy, isuserbuffer)) {
58621 +                       return -EFAULT;
58622 +               }
58623 +
58624 +               DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
58625 +                       tpmif->domid, buffer[offset], buffer[offset + 1],
58626 +                       buffer[offset + 2], buffer[offset + 3]);
58627 +
58628 +               unmap_op.host_addr = MMAP_VADDR(tpmif, i);
58629 +               unmap_op.handle = handle;
58630 +               unmap_op.dev_bus_addr = 0;
58631 +
58632 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
58633 +                                                      &unmap_op, 1))) {
58635 +                       BUG();
58636 +               }
58637 +
58638 +               offset += to_copy;
58639 +               pg_offset = 0;
58640 +               last_read += to_copy;
58641 +               room_left -= to_copy;
58642 +
58643 +               to_copy = MIN(PAGE_SIZE, room_left);
58644 +               i++;
58645 +       }                       /* while (to_copy > 0) */
58646 +       /*
58647 +        * Adjust the last_read pointer
58648 +        */
58649 +       pak->last_read = last_read + 4;
58650 +       return offset;
58651 +}
58652 +
58653 +/* ============================================================
58654 + * The file layer for reading data from this device
58655 + * ============================================================
58656 + */
58657 +static int vtpm_op_open(struct inode *inode, struct file *f)
58658 +{
58659 +       int rc = 0;
58660 +       unsigned long flags;
58661 +
58662 +       write_lock_irqsave(&dataex.pak_lock, flags);
58663 +       if (dataex.has_opener == 0) {
58664 +               dataex.has_opener = 1;
58665 +       } else {
58666 +               rc = -EPERM;
58667 +       }
58668 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58669 +       return rc;
58670 +}
58671 +
58672 +static ssize_t vtpm_op_read(struct file *file,
58673 +                           char __user * data, size_t size, loff_t * offset)
58674 +{
58675 +       int ret_size = -ENODATA;
58676 +       struct packet *pak = NULL;
58677 +       unsigned long flags;
58678 +
58679 +       write_lock_irqsave(&dataex.pak_lock, flags);
58680 +
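+       /* Nothing pending: drop the lock and block until a packet is queued. */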
58681 +       if (list_empty(&dataex.pending_pak)) {
58682 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
58683 +               wait_event_interruptible(dataex.wait_queue,
58684 +                                        !list_empty(&dataex.pending_pak));
58685 +               write_lock_irqsave(&dataex.pak_lock, flags);
58686 +       }
58687 +
58688 +       if (!list_empty(&dataex.pending_pak)) {
58689 +               unsigned int left;
58690 +               pak = list_entry(dataex.pending_pak.next, struct packet, next);
58691 +
58692 +               left = pak->data_len - dataex.copied_so_far;
58693 +
58694 +               DPRINTK("size given by app: %d, available: %d\n", size, left);
58695 +
58696 +               ret_size = MIN(size, left);
58697 +
58698 +               ret_size = packet_read(pak, ret_size, data, size, 1);
58699 +               if (ret_size < 0) {
58700 +                       ret_size = -EFAULT;
58701 +               } else {
58702 +                       DPRINTK("Copied %d bytes to user buffer\n", ret_size);
58703 +
58704 +                       dataex.copied_so_far += ret_size;
58705 +                       if (dataex.copied_so_far >= pak->data_len + 4) {
58706 +                               DPRINTK("All data from this packet given to app.\n");
58707 +                               /* All data given to app */
58708 +
58709 +                               del_singleshot_timer_sync(&pak->processing_timer);
58711 +                               list_del(&pak->next);
58712 +                               list_add_tail(&pak->next, &dataex.current_pak);
58713 +                               /*
58714 +                                * The more frontends that are handled at the same time,
58715 +                                * the more time we give the TPM to process the request.
58716 +                                */
58717 +                               mod_timer(&pak->processing_timer,
58718 +                                         jiffies + (num_frontends * 60 * HZ));
58719 +                               dataex.copied_so_far = 0;
58720 +                       }
58721 +               }
58722 +       }
58723 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58724 +
58725 +       DPRINTK("Returning result from read to app: %d\n", ret_size);
58726 +
58727 +       return ret_size;
58728 +}
58729 +
58730 +/*
58731 + * Write operation - only works after a previous read operation!
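+ * The instance number at the head of the written buffer identifies the request being answered.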
58732 + */
58733 +static ssize_t vtpm_op_write(struct file *file,
58734 +                            const char __user * data, size_t size,
58735 +                            loff_t * offset)
58736 +{
58737 +       struct packet *pak;
58738 +       int rc = 0;
58739 +       unsigned int off = 4;
58740 +       unsigned long flags;
58741 +       struct vtpm_resp_hdr vrh;
58742 +
58743 +       /*
58744 +        * Minimum required packet size is:
58745 +        * 4 bytes for instance number
58746 +        * 2 bytes for tag
58747 +        * 4 bytes for paramSize
58748 +        * 4 bytes for the ordinal
58749 +        * sum: 14 bytes
58750 +        */
58751 +       if (size < sizeof (vrh))
58752 +               return -EFAULT;
58753 +
58754 +       if (copy_from_user(&vrh, data, sizeof (vrh)))
58755 +               return -EFAULT;
58756 +
58757 +       /* malformed packet? */
58758 +       if ((off + ntohl(vrh.len_no)) != size)
58759 +               return -EFAULT;
58760 +
58761 +       write_lock_irqsave(&dataex.pak_lock, flags);
58762 +       pak = packet_find_instance(&dataex.current_pak,
58763 +                                  ntohl(vrh.instance_no));
58764 +
58765 +       if (pak == NULL) {
58766 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
58767 +               printk(KERN_ALERT "No associated packet! (inst=%d)\n",
58768 +                      ntohl(vrh.instance_no));
58769 +               return -EFAULT;
58770 +       }
58771 +
58772 +       del_singleshot_timer_sync(&pak->processing_timer);
58773 +       list_del(&pak->next);
58774 +
58775 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58776 +
58777 +       /*
58778 +        * The first 'off' bytes must be the instance number - skip them.
58779 +        */
58780 +       size -= off;
58781 +
58782 +       rc = packet_write(pak, &data[off], size, 1);
58783 +
58784 +       if (rc > 0) {
58785 +               /* account for the 'off' header bytes that were skipped */
58786 +               rc += off;
58787 +       }
58788 +       packet_free(pak);
58789 +       return rc;
58790 +}
58791 +
58792 +static int vtpm_op_release(struct inode *inode, struct file *file)
58793 +{
58794 +       unsigned long flags;
58795 +
58796 +       vtpm_release_packets(NULL, 1);
58797 +       write_lock_irqsave(&dataex.pak_lock, flags);
58798 +       dataex.has_opener = 0;
58799 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58800 +       return 0;
58801 +}
58802 +
58803 +static unsigned int vtpm_op_poll(struct file *file,
58804 +                                struct poll_table_struct *pts)
58805 +{
58806 +       unsigned int flags = POLLOUT | POLLWRNORM;
58807 +
58808 +       poll_wait(file, &dataex.wait_queue, pts);
58809 +       if (!list_empty(&dataex.pending_pak)) {
58810 +               flags |= POLLIN | POLLRDNORM;
58811 +       }
58812 +       return flags;
58813 +}
58814 +
58815 +static struct file_operations vtpm_ops = {
58816 +       .owner = THIS_MODULE,
58817 +       .llseek = no_llseek,
58818 +       .open = vtpm_op_open,
58819 +       .read = vtpm_op_read,
58820 +       .write = vtpm_op_write,
58821 +       .release = vtpm_op_release,
58822 +       .poll = vtpm_op_poll,
58823 +};
58824 +
58825 +static struct miscdevice vtpms_miscdevice = {
58826 +       .minor = 225,
58827 +       .name = "vtpm",
58828 +       .fops = &vtpm_ops,
58829 +};
58830 +
58831 +/***************************************************************
58832 + Virtual TPM functions and data structures
58833 +***************************************************************/
58834 +
58835 +static u8 create_cmd[] = {
58836 +       1, 193,                 /* 0: TPM_TAG_RQU_COMMAND */
58837 +       0, 0, 0, 19,            /* 2: length */
58838 +       0, 0, 0, 0x1,           /* 6: VTPM_ORD_OPEN */
58839 +       0,                      /* 10: VTPM type */
58840 +       0, 0, 0, 0,             /* 11: domain id */
58841 +       0, 0, 0, 0              /* 15: instance id */
58842 +};
58843 +
58844 +int tpmif_vtpm_open(tpmif_t * tpmif, domid_t domid, u32 instance)
58845 +{
58846 +       int rc = 0;
58847 +       struct packet *pak;
58848 +
58849 +       pak = packet_alloc(tpmif,
58850 +                          sizeof (create_cmd),
58851 +                          create_cmd[1],
58852 +                          PACKET_FLAG_DISCARD_RESPONSE |
58853 +                          PACKET_FLAG_CHECK_RESPONSESTATUS);
58854 +       if (pak) {
58855 +               u8 buf[sizeof (create_cmd)];
58856 +               u32 domid_no = htonl((u32) domid);
58857 +               u32 instance_no = htonl(instance);
58858 +
58859 +               memcpy(buf, create_cmd, sizeof (create_cmd));
58860 +
58861 +               memcpy(&buf[11], &domid_no, sizeof (u32));
58862 +               memcpy(&buf[15], &instance_no, sizeof (u32));
58863 +
58864 +               /* copy the buffer into the packet */
58865 +               rc = packet_set(pak, buf, sizeof (buf));
58866 +
58867 +               if (rc == 0) {
58868 +                       pak->tpm_instance = 0;
58869 +                       rc = vtpm_queue_packet(pak);
58870 +               }
58871 +               if (rc < 0) {
58872 +                       /* could not be queued or built */
58873 +                       packet_free(pak);
58874 +               }
58875 +       } else {
58876 +               rc = -ENOMEM;
58877 +       }
58878 +       return rc;
58879 +}
58880 +
58881 +static u8 destroy_cmd[] = {
58882 +       1, 193,                 /* 0: TPM_TAG_RQU_COMMAND */
58883 +       0, 0, 0, 14,            /* 2: length */
58884 +       0, 0, 0, 0x2,           /* 6: VTPM_ORD_CLOSE */
58885 +       0, 0, 0, 0              /* 10: instance id */
58886 +};
58887 +
58888 +int tpmif_vtpm_close(u32 instid)
58889 +{
58890 +       int rc = 0;
58891 +       struct packet *pak;
58892 +
58893 +       pak = packet_alloc(NULL,
58894 +                          sizeof (destroy_cmd),
58895 +                          destroy_cmd[1], PACKET_FLAG_DISCARD_RESPONSE);
58896 +       if (pak) {
58897 +               u8 buf[sizeof (destroy_cmd)];
58898 +               u32 instid_no = htonl(instid);
58899 +
58900 +               memcpy(buf, destroy_cmd, sizeof (destroy_cmd));
58901 +               memcpy(&buf[10], &instid_no, sizeof (u32));
58902 +
58903 +               /* copy the buffer into the packet */
58904 +               rc = packet_set(pak, buf, sizeof (buf));
58905 +
58906 +               if (rc == 0) {
58907 +                       pak->tpm_instance = 0;
58908 +                       rc = vtpm_queue_packet(pak);
58909 +               }
58910 +               if (rc < 0) {
58911 +                       /* could not be queued or built */
58912 +                       packet_free(pak);
58913 +               }
58914 +       } else {
58915 +               rc = -ENOMEM;
58916 +       }
58917 +       return rc;
58918 +}
58919 +
58920 +/***************************************************************
58921 + Utility functions
58922 +***************************************************************/
58923 +
58924 +static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
58925 +{
58926 +       int rc;
58927 +       static const unsigned char tpm_error_message_fail[] = {
58928 +               0x00, 0x00,
58929 +               0x00, 0x00, 0x00, 0x0a,
58930 +               0x00, 0x00, 0x00, 0x09  /* TPM_FAIL */
58931 +       };
58932 +       unsigned char buffer[sizeof (tpm_error_message_fail)];
58933 +
58934 +       memcpy(buffer, tpm_error_message_fail,
58935 +              sizeof (tpm_error_message_fail));
58936 +       /*
58937 +        * Insert the right response tag depending on the given tag.
58938 +        * All response tags are '+3' to the request tag.
58939 +        */
58940 +       buffer[1] = req_tag + 3;
58941 +
58942 +       /*
58943 +        * Write the data to shared memory and notify the front-end
58944 +        */
58945 +       rc = packet_write(pak, buffer, sizeof (buffer), 0);
58946 +
58947 +       return rc;
58948 +}
58949 +
58950 +static void _vtpm_release_packets(struct list_head *head,
58951 +                                 tpmif_t * tpmif, int send_msgs)
58952 +{
58953 +       struct packet *pak;
58954 +       struct list_head *pos, *tmp;
58956 +
58957 +       list_for_each_safe(pos, tmp, head) {
58958 +               pak = list_entry(pos, struct packet, next);
58959 +
58960 +               if (tpmif == NULL || pak->tpmif == tpmif) {
58961 +                       int can_send = 0;
58962 +
58963 +                       del_singleshot_timer_sync(&pak->processing_timer);
58964 +                       list_del(&pak->next);
58965 +
58966 +                       if (pak->tpmif && pak->tpmif->status == CONNECTED) {
58967 +                               can_send = 1;
58968 +                       }
58969 +
58970 +                       if (send_msgs && can_send) {
58971 +                               tpm_send_fail_message(pak, pak->req_tag);
58972 +                       }
58973 +                       packet_free(pak);
58974 +               }
58975 +       }
58976 +}
58977 +
58978 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
58979 +{
58980 +       unsigned long flags;
58981 +
58982 +       write_lock_irqsave(&dataex.pak_lock, flags);
58983 +
58984 +       _vtpm_release_packets(&dataex.pending_pak, tpmif, send_msgs);
58985 +       _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
58986 +
58987 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58988 +       return 0;
58989 +}
58990 +
58991 +static int vtpm_queue_packet(struct packet *pak)
58992 +{
58993 +       int rc = 0;
58994 +
58995 +       if (dataex.has_opener) {
58996 +               unsigned long flags;
58997 +
58998 +               write_lock_irqsave(&dataex.pak_lock, flags);
58999 +               list_add_tail(&pak->next, &dataex.pending_pak);
59000 +               /* give the TPM some time to pick up the request */
59001 +               mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
59002 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
59003 +
59004 +               wake_up_interruptible(&dataex.wait_queue);
59005 +       } else {
59006 +               rc = -EFAULT;
59007 +       }
59008 +       return rc;
59009 +}
59010 +
59011 +static int vtpm_receive(tpmif_t * tpmif, u32 size)
59012 +{
59013 +       int rc = 0;
59014 +       unsigned char buffer[10];
59015 +       __be32 *native_size;
59016 +       struct packet *pak = packet_alloc(tpmif, size, 0, 0);
59017 +
59018 +       if (!pak)
59019 +               return -ENOMEM;
59020 +       /*
59021 +        * Read 10 bytes from the received buffer to test its
59022 +        * content for validity.
59023 +        */
59024 +       if (sizeof (buffer) != packet_read(pak,
59025 +                                          sizeof (buffer), buffer,
59026 +                                          sizeof (buffer), 0)) {
59027 +               goto failexit;
59028 +       }
59029 +       /*
59030 +        * Reset the packet read pointer so we can read all its
59031 +        * contents again.
59032 +        */
59033 +       packet_reset(pak);
59034 +
59035 +       native_size = (__force __be32 *) (&buffer[4 + 2]);
59036 +       /*
59037 +        * Verify that the size of the packet is correct
59038 +        * as indicated and that there's actually someone reading packets.
59039 +        * The minimum size of the packet is '10' for tag, size indicator
59040 +        * and ordinal.
59041 +        */
59042 +       if (size < 10 ||
59043 +           be32_to_cpu(*native_size) != size ||
59044 +           0 == dataex.has_opener || tpmif->status != CONNECTED) {
59045 +               rc = -EINVAL;
59046 +               goto failexit;
59047 +       } else {
59048 +               rc = vtpm_queue_packet(pak);
59049 +               if (rc < 0)
59050 +                       goto failexit;
59051 +       }
59052 +       return 0;
59053 +
59054 +      failexit:
59055 +       if (pak) {
59056 +               tpm_send_fail_message(pak, buffer[4 + 1]);
59057 +               packet_free(pak);
59058 +       }
59059 +       return rc;
59060 +}
59061 +
59062 +/*
59063 + * Timeout function that gets invoked when a packet has not been processed
59064 + * during the timeout period.
59065 + * The packet must be on a list when this function is invoked. This
59066 + * also means that once it's taken off a list, the timer must be
59067 + * destroyed as well.
59068 + */
59069 +static void processing_timeout(unsigned long ptr)
59070 +{
59071 +       struct packet *pak = (struct packet *)ptr;
59072 +       unsigned long flags;
59073 +
59074 +       write_lock_irqsave(&dataex.pak_lock, flags);
59075 +       /*
59076 +        * Check whether the packet is still on one
59077 +        * of the lists.
59078 +        */
59079 +       if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
59080 +           pak == packet_find_packet(&dataex.current_pak, pak)) {
59081 +               list_del(&pak->next);
59082 +               if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
59083 +                       tpm_send_fail_message(pak, pak->req_tag);
59084 +               }
59085 +               packet_free(pak);
59086 +       }
59087 +
59088 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
59089 +}
59090 +
59091 +static void tpm_tx_action(unsigned long unused);
59092 +static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
59093 +
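+/* Interfaces with pending ring requests queue here; tpm_tx_action() drains them. */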
59094 +static struct list_head tpm_schedule_list;
59095 +static spinlock_t tpm_schedule_list_lock;
59096 +
59097 +static inline void maybe_schedule_tx_action(void)
59098 +{
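+       /* Make sure list updates are visible before the tasklet runs. */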
59099 +       smp_mb();
59100 +       tasklet_schedule(&tpm_tx_tasklet);
59101 +}
59102 +
59103 +static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
59104 +{
59105 +       return tpmif->list.next != NULL;
59106 +}
59107 +
59108 +static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
59109 +{
59110 +       spin_lock_irq(&tpm_schedule_list_lock);
59111 +       if (likely(__on_tpm_schedule_list(tpmif))) {
59112 +               list_del(&tpmif->list);
59113 +               tpmif->list.next = NULL;
59114 +               tpmif_put(tpmif);
59115 +       }
59116 +       spin_unlock_irq(&tpm_schedule_list_lock);
59117 +}
59118 +
59119 +static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
59120 +{
59121 +       if (__on_tpm_schedule_list(tpmif))
59122 +               return;
59123 +
59124 +       spin_lock_irq(&tpm_schedule_list_lock);
59125 +       if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
59126 +               list_add_tail(&tpmif->list, &tpm_schedule_list);
59127 +               tpmif_get(tpmif);
59128 +       }
59129 +       spin_unlock_irq(&tpm_schedule_list_lock);
59130 +}
59131 +
59132 +void tpmif_schedule_work(tpmif_t * tpmif)
59133 +{
59134 +       add_to_tpm_schedule_list_tail(tpmif);
59135 +       maybe_schedule_tx_action();
59136 +}
59137 +
59138 +void tpmif_deschedule_work(tpmif_t * tpmif)
59139 +{
59140 +       remove_from_tpm_schedule_list(tpmif);
59141 +}
59142 +
59143 +static void tpm_tx_action(unsigned long unused)
59144 +{
59145 +       struct list_head *ent;
59146 +       tpmif_t *tpmif;
59147 +       tpmif_tx_request_t *tx;
59148 +
59149 +       DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
59150 +
59151 +       while (!list_empty(&tpm_schedule_list)) {
59152 +               /* Get a tpmif from the list with work to do. */
59153 +               ent = tpm_schedule_list.next;
59154 +               tpmif = list_entry(ent, tpmif_t, list);
59155 +               tpmif_get(tpmif);
59156 +               remove_from_tpm_schedule_list(tpmif);
59157 +
59158 +               tx = &tpmif->tx->ring[0].req;
59159 +
59160 +               /* pass it up */
59161 +               vtpm_receive(tpmif, tx->size);
59162 +
59163 +               tpmif_put(tpmif);
59164 +       }
59165 +}
59166 +
59167 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
59168 +{
59169 +       tpmif_t *tpmif = (tpmif_t *) dev_id;
59170 +
59171 +       add_to_tpm_schedule_list_tail(tpmif);
59172 +       maybe_schedule_tx_action();
59173 +       return IRQ_HANDLED;
59174 +}
59175 +
59176 +static int __init tpmback_init(void)
59177 +{
59178 +       int rc;
59179 +
59180 +       if ((rc = misc_register(&vtpms_miscdevice)) != 0) {
59181 +               printk(KERN_ALERT
59182 +                      "Could not register misc device for TPM BE.\n");
59183 +               return rc;
59184 +       }
59185 +
59186 +       INIT_LIST_HEAD(&dataex.pending_pak);
59187 +       INIT_LIST_HEAD(&dataex.current_pak);
59188 +       dataex.has_opener = 0;
59189 +       rwlock_init(&dataex.pak_lock);
59190 +       init_waitqueue_head(&dataex.wait_queue);
59191 +
59192 +       spin_lock_init(&tpm_schedule_list_lock);
59193 +       INIT_LIST_HEAD(&tpm_schedule_list);
59194 +
59195 +       tpmif_interface_init();
59196 +       tpmif_xenbus_init();
59197 +
59198 +       printk(KERN_ALERT "Successfully initialized TPM backend driver.\n");
59199 +
59200 +       return 0;
59201 +}
59202 +
59203 +module_init(tpmback_init);
59204 +
59205 +static void __exit tpmback_exit(void)
59206 +{
59207 +       tpmif_xenbus_exit();
59208 +       tpmif_interface_exit();
59209 +       misc_deregister(&vtpms_miscdevice);
59210 +}
59211 +
59212 +module_exit(tpmback_exit);
59213 +
59214 +MODULE_LICENSE("Dual BSD/GPL");
59215 +
59216 +/*
59217 + * Local variables:
59218 + *  c-file-style: "linux"
59219 + *  indent-tabs-mode: t
59220 + *  c-indent-level: 8
59221 + *  c-basic-offset: 8
59222 + *  tab-width: 8
59223 + * End:
59224 + */
59225 diff -Nurp ref-linux-2.6.16.9/drivers/xen/tpmback/xenbus.c tmp-linux-2.6-xen.patch/drivers/xen/tpmback/xenbus.c
59226 --- ref-linux-2.6.16.9/drivers/xen/tpmback/xenbus.c     1970-01-01 01:00:00.000000000 +0100
59227 +++ tmp-linux-2.6-xen.patch/drivers/xen/tpmback/xenbus.c        2006-04-10 00:05:52.000000000 +0200
59228 @@ -0,0 +1,328 @@
59229 +/*  Xenbus code for tpmif backend
59230 +    Copyright (C) 2005 IBM Corporation
59231 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
59232 +
59233 +    This program is free software; you can redistribute it and/or modify
59234 +    it under the terms of the GNU General Public License as published by
59235 +    the Free Software Foundation; either version 2 of the License, or
59236 +    (at your option) any later version.
59237 +
59238 +    This program is distributed in the hope that it will be useful,
59239 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
59240 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
59241 +    GNU General Public License for more details.
59242 +
59243 +    You should have received a copy of the GNU General Public License
59244 +    along with this program; if not, write to the Free Software
59245 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
59246 +*/
59247 +#include <stdarg.h>
59248 +#include <linux/module.h>
59249 +#include <xen/xenbus.h>
59250 +#include "common.h"
59251 +
59252 +struct backend_info
59253 +{
59254 +       struct xenbus_device *dev;
59255 +
59256 +       /* our communications channel */
59257 +       tpmif_t *tpmif;
59258 +
59259 +       long int frontend_id;
59260 +       long int instance; // instance of TPM
59261 +       u8 is_instance_set; // whether the instance number has been set
59262 +
59263 +       /* watch front end for changes */
59264 +       struct xenbus_watch backend_watch;
59265 +       XenbusState frontend_state;
59266 +};
59267 +
59268 +static void maybe_connect(struct backend_info *be);
59269 +static void connect(struct backend_info *be);
59270 +static int connect_ring(struct backend_info *be);
59271 +static void backend_changed(struct xenbus_watch *watch,
59272 +                            const char **vec, unsigned int len);
59273 +static void frontend_changed(struct xenbus_device *dev,
59274 +                             XenbusState frontend_state);
59275 +
59276 +static int tpmback_remove(struct xenbus_device *dev)
59277 +{
59278 +       struct backend_info *be = dev->data;
59279 +
59280 +       if (be->backend_watch.node) {
59281 +               unregister_xenbus_watch(&be->backend_watch);
59282 +               kfree(be->backend_watch.node);
59283 +               be->backend_watch.node = NULL;
59284 +       }
59285 +       if (be->tpmif) {
59286 +               tpmif_put(be->tpmif);
59287 +               be->tpmif = NULL;
59288 +       }
59289 +       kfree(be);
59290 +       dev->data = NULL;
59291 +       return 0;
59292 +}
59293 +
59294 +static int tpmback_probe(struct xenbus_device *dev,
59295 +                         const struct xenbus_device_id *id)
59296 +{
59297 +       int err;
59298 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
59299 +                                         GFP_KERNEL);
59300 +
59301 +       if (!be) {
59302 +               xenbus_dev_fatal(dev, -ENOMEM,
59303 +                                "allocating backend structure");
59304 +               return -ENOMEM;
59305 +       }
59306 +
59307 +       be->is_instance_set = 0;
59308 +       be->dev = dev;
59309 +       dev->data = be;
59310 +
59311 +       err = xenbus_watch_path2(dev, dev->nodename,
59312 +                               "instance", &be->backend_watch,
59313 +                               backend_changed);
59314 +       if (err) {
59315 +               goto fail;
59316 +       }
59317 +
59318 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
59319 +       if (err) {
59320 +               goto fail;
59321 +       }
59322 +       return 0;
59323 +fail:
59324 +       tpmback_remove(dev);
59325 +       return err;
59326 +}
59327 +
59328 +
59329 +static void backend_changed(struct xenbus_watch *watch,
59330 +                            const char **vec, unsigned int len)
59331 +{
59332 +       int err;
59333 +       long instance;
59334 +       struct backend_info *be
59335 +               = container_of(watch, struct backend_info, backend_watch);
59336 +       struct xenbus_device *dev = be->dev;
59337 +
59338 +       err = xenbus_scanf(XBT_NULL, dev->nodename,
59339 +                          "instance", "%li", &instance);
59340 +       if (XENBUS_EXIST_ERR(err)) {
59341 +               return;
59342 +       }
59343 +
59344 +       if (err != 1) {
59345 +               xenbus_dev_fatal(dev, err, "reading instance");
59346 +               return;
59347 +       }
59348 +
59349 +       if (be->is_instance_set != 0 && be->instance != instance) {
59350 +               printk(KERN_WARNING
59351 +                      "tpmback: changing instance (from %ld to %ld) "
59352 +                      "not allowed.\n",
59353 +                      be->instance, instance);
59354 +               return;
59355 +       }
59356 +
59357 +       if (be->is_instance_set == 0) {
59358 +               be->tpmif = tpmif_find(dev->otherend_id,
59359 +                                      instance);
59360 +               if (IS_ERR(be->tpmif)) {
59361 +                       err = PTR_ERR(be->tpmif);
59362 +                       be->tpmif = NULL;
59363 +                       xenbus_dev_fatal(dev, err, "creating tpm interface");
59364 +                       return;
59365 +               }
59366 +               be->instance = instance;
59367 +               be->is_instance_set = 1;
59368 +
59369 +               /*
59370 +                * There's an unfortunate problem:
59371 +                * Sometimes after a suspend/resume the state switch
59372 +                * to XenbusStateInitialised happens *before* this
59373 +                * point is reached. In that case connect_ring() must
59374 +                * have failed (be->tpmif was still NULL), so it is
59375 +                * invoked again indirectly from here.
59376 +                */
59377 +               if (be->frontend_state == XenbusStateInitialised) {
59378 +                       frontend_changed(dev, be->frontend_state);
59379 +               }
59380 +       }
59381 +}
59382 +
59383 +
59384 +static void frontend_changed(struct xenbus_device *dev,
59385 +                             XenbusState frontend_state)
59386 +{
59387 +       struct backend_info *be = dev->data;
59388 +       int err;
59389 +
59390 +       be->frontend_state = frontend_state;
59391 +
59392 +       switch (frontend_state) {
59393 +       case XenbusStateInitialising:
59394 +       case XenbusStateConnected:
59395 +               break;
59396 +
59397 +       case XenbusStateInitialised:
59398 +               err = connect_ring(be);
59399 +               if (err) {
59400 +                       return;
59401 +               }
59402 +               maybe_connect(be);
59403 +               break;
59404 +
59405 +       case XenbusStateClosing:
59406 +               xenbus_switch_state(dev, XenbusStateClosing);
59407 +               break;
59408 +
59409 +       case XenbusStateClosed:
59410 +               /*
59411 +                * Notify the vTPM manager about the front-end
59412 +                * having left.
59413 +                */
59414 +               tpmif_vtpm_close(be->instance);
59415 +               device_unregister(&be->dev->dev);
59416 +               break;
59417 +
59418 +       case XenbusStateUnknown:
59419 +       case XenbusStateInitWait:
59420 +       default:
59421 +               xenbus_dev_fatal(dev, -EINVAL,
59422 +                                "saw state %d at frontend",
59423 +                                frontend_state);
59424 +               break;
59425 +       }
59426 +}
59427 +
59428 +
59429 +
59430 +static void maybe_connect(struct backend_info *be)
59431 +{
59432 +       int err;
59433 +
59434 +       if (be->tpmif == NULL || be->tpmif->status == CONNECTED)
59435 +               return;
59436 +
59437 +       connect(be);
59438 +
59439 +       /*
59440 +        * Notify the vTPM manager about a new front-end.
59441 +        */
59442 +       err = tpmif_vtpm_open(be->tpmif,
59443 +                             be->frontend_id,
59444 +                             be->instance);
59445 +       if (err) {
59446 +               xenbus_dev_error(be->dev, err,
59447 +                                "queueing vtpm open packet");
59448 +               /*
59449 +                * Should close down this device and notify FE
59450 +                * about closure.
59451 +                */
59452 +               return;
59453 +       }
59454 +}
59455 +
59456 +
59457 +static void connect(struct backend_info *be)
59458 +{
59459 +       xenbus_transaction_t xbt;
59460 +       int err;
59461 +       struct xenbus_device *dev = be->dev;
59462 +       unsigned long ready = 1;
59463 +
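+/* The transaction is retried from here if it ends with -EAGAIN. */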
59464 +again:
59465 +       err = xenbus_transaction_start(&xbt);
59466 +       if (err) {
59467 +               xenbus_dev_fatal(be->dev, err, "starting transaction");
59468 +               return;
59469 +       }
59470 +
59471 +       err = xenbus_printf(xbt, be->dev->nodename,
59472 +                           "ready", "%lu", ready);
59473 +       if (err) {
59474 +               xenbus_dev_fatal(be->dev, err, "writing 'ready'");
59475 +               goto abort;
59476 +       }
59477 +
59478 +       err = xenbus_transaction_end(xbt, 0);
59479 +       if (err == -EAGAIN)
59480 +               goto again;
59481 +       if (err)
59482 +               xenbus_dev_fatal(be->dev, err, "end of transaction");
59483 +
59484 +       err = xenbus_switch_state(dev, XenbusStateConnected);
59485 +       if (!err)
59486 +               be->tpmif->status = CONNECTED;
59487 +       return;
59488 +abort:
59489 +       xenbus_transaction_end(xbt, 1);
59490 +}
59491 +
59492 +
59493 +static int connect_ring(struct backend_info *be)
59494 +{
59495 +       struct xenbus_device *dev = be->dev;
59496 +       unsigned long ring_ref;
59497 +       unsigned int evtchn;
59498 +       int err;
59499 +
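+       /* Fetch the ring grant reference and event channel published by the frontend. */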
59500 +       err = xenbus_gather(XBT_NULL, dev->otherend,
59501 +                           "ring-ref", "%lu", &ring_ref,
59502 +                           "event-channel", "%u", &evtchn, NULL);
59503 +       if (err) {
59504 +               xenbus_dev_error(dev, err,
59505 +                                "reading %s/ring-ref and event-channel",
59506 +                                dev->otherend);
59507 +               return err;
59508 +       }
59509 +       if (be->tpmif != NULL) {
59510 +               err = tpmif_map(be->tpmif, ring_ref, evtchn);
59511 +               if (err) {
59512 +                       xenbus_dev_error(dev, err,
59513 +                                        "mapping shared-frame %lu port %u",
59514 +                                        ring_ref, evtchn);
59515 +                       return err;
59516 +               }
59517 +       }
59518 +       return 0;
59519 +}
59520 +
59521 +
59522 +static struct xenbus_device_id tpmback_ids[] = {
59523 +       { "vtpm" },
59524 +       { "" }
59525 +};
59526 +
59527 +
59528 +static struct xenbus_driver tpmback = {
59529 +       .name = "vtpm",
59530 +       .owner = THIS_MODULE,
59531 +       .ids = tpmback_ids,
59532 +       .probe = tpmback_probe,
59533 +       .remove = tpmback_remove,
59534 +       .otherend_changed = frontend_changed,
59535 +};
59536 +
59537 +
59538 +void tpmif_xenbus_init(void)
59539 +{
59540 +       xenbus_register_backend(&tpmback);
59541 +}
59542 +
59543 +void tpmif_xenbus_exit(void)
59544 +{
59545 +       xenbus_unregister_driver(&tpmback);
59546 +}
59547 +
59548 +/*
59549 + * Local variables:
59550 + *  c-file-style: "linux"
59551 + *  indent-tabs-mode: t
59552 + *  c-indent-level: 8
59553 + *  c-basic-offset: 8
59554 + *  tab-width: 8
59555 + * End:
59556 + */
59557 diff -Nurp ref-linux-2.6.16.9/drivers/xen/tpmfront/Makefile tmp-linux-2.6-xen.patch/drivers/xen/tpmfront/Makefile
59558 --- ref-linux-2.6.16.9/drivers/xen/tpmfront/Makefile    1970-01-01 01:00:00.000000000 +0100
59559 +++ tmp-linux-2.6-xen.patch/drivers/xen/tpmfront/Makefile       2006-04-10 00:05:52.000000000 +0200
59560 @@ -0,0 +1,2 @@
59561 +
59562 +obj-$(CONFIG_XEN_TPMDEV_FRONTEND)      += tpmfront.o
59563 diff -Nurp ref-linux-2.6.16.9/drivers/xen/tpmfront/tpmfront.c tmp-linux-2.6-xen.patch/drivers/xen/tpmfront/tpmfront.c
59564 --- ref-linux-2.6.16.9/drivers/xen/tpmfront/tpmfront.c  1970-01-01 01:00:00.000000000 +0100
59565 +++ tmp-linux-2.6-xen.patch/drivers/xen/tpmfront/tpmfront.c     2006-04-10 00:05:52.000000000 +0200
59566 @@ -0,0 +1,731 @@
59567 +/*
59568 + * Copyright (c) 2005, IBM Corporation
59569 + *
59570 + * Author: Stefan Berger, stefanb@us.ibm.com
59571 + * Grant table support: Mahadevan Gomathisankaran
59572 + *
59573 + * This code has been derived from drivers/xen/netfront/netfront.c
59574 + *
59575 + * Copyright (c) 2002-2004, K A Fraser
59576 + *
59577 + * This program is free software; you can redistribute it and/or
59578 + * modify it under the terms of the GNU General Public License version 2
59579 + * as published by the Free Software Foundation; or, when distributed
59580 + * separately from the Linux kernel or incorporated into other
59581 + * software packages, subject to the following license:
59582 + * 
59583 + * Permission is hereby granted, free of charge, to any person obtaining a copy
59584 + * of this source file (the "Software"), to deal in the Software without
59585 + * restriction, including without limitation the rights to use, copy, modify,
59586 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
59587 + * and to permit persons to whom the Software is furnished to do so, subject to
59588 + * the following conditions:
59589 + *
59590 + * The above copyright notice and this permission notice shall be included in
59591 + * all copies or substantial portions of the Software.
59592 + *
59593 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59594 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59595 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
59596 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
59597 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
59598 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
59599 + * IN THE SOFTWARE.
59600 + */
59601 +
59602 +#include <linux/config.h>
59603 +#include <linux/module.h>
59604 +#include <linux/version.h>
59605 +#include <linux/kernel.h>
59606 +#include <linux/slab.h>
59607 +#include <linux/errno.h>
59608 +#include <linux/interrupt.h>
59609 +#include <linux/init.h>
59610 +#include <xen/tpmfe.h>
59611 +#include <linux/err.h>
59612 +#include <linux/mutex.h>
59613 +#include <asm/io.h>
59614 +#include <xen/evtchn.h>
59615 +#include <xen/interface/grant_table.h>
59616 +#include <xen/interface/io/tpmif.h>
59617 +#include <asm/uaccess.h>
59618 +#include <xen/xenbus.h>
59619 +#include <xen/interface/grant_table.h>
59620 +
59621 +#include "tpmfront.h"
59622 +
59623 +#undef DEBUG
59624 +
59625 +/* locally visible variables */
59626 +static grant_ref_t gref_head;
59627 +static struct tpm_private *my_priv;
59628 +
59629 +/* local function prototypes */
59630 +static irqreturn_t tpmif_int(int irq,
59631 +                             void *tpm_priv,
59632 +                             struct pt_regs *ptregs);
59633 +static void tpmif_rx_action(unsigned long unused);
59634 +static void tpmif_connect(struct tpm_private *tp, domid_t domid);
59635 +static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
59636 +static int tpm_allocate_buffers(struct tpm_private *tp);
59637 +static void tpmif_set_connected_state(struct tpm_private *tp,
59638 +                                      u8 newstate);
59639 +static int tpm_xmit(struct tpm_private *tp,
59640 +                    const u8 * buf, size_t count, int userbuffer,
59641 +                    void *remember);
59642 +
59643 +#define DPRINTK(fmt, args...) \
59644 +    pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
59645 +#define IPRINTK(fmt, args...) \
59646 +    printk(KERN_INFO "xen_tpm_fr: " fmt, ##args)
59647 +#define WPRINTK(fmt, args...) \
59648 +    printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args)
59649 +
59650 +
59651 +static inline int
59652 +tx_buffer_copy(struct tx_buffer *txb, const u8 * src, int len,
59653 +               int isuserbuffer)
59654 +{
59655 +       int copied = len;
59656 +
59657 +       if (len > txb->size) {
59658 +               copied = txb->size;
59659 +       }
59660 +       if (isuserbuffer) {
59661 +               if (copy_from_user(txb->data, src, copied))
59662 +                       return -EFAULT;
59663 +       } else {
59664 +               memcpy(txb->data, src, copied);
59665 +       }
59666 +       txb->len = len;
59667 +       return copied;
59668 +}
59669 +
59670 +static inline struct tx_buffer *tx_buffer_alloc(void)
59671 +{
59672 +       struct tx_buffer *txb = kzalloc(sizeof (struct tx_buffer),
59673 +                                       GFP_KERNEL);
59674 +
59675 +       if (txb) {
59676 +               txb->len = 0;
59677 +               txb->size = PAGE_SIZE;
59678 +               txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
59679 +               if (txb->data == NULL) {
59680 +                       kfree(txb);
59681 +                       txb = NULL;
59682 +               }
59683 +       }
59684 +       return txb;
59685 +}
59686 +
59687 +
59688 +/**************************************************************
59689 + Utility functions for the tpm_private structure
59690 +**************************************************************/
59691 +static inline void tpm_private_init(struct tpm_private *tp)
59692 +{
59693 +       spin_lock_init(&tp->tx_lock);
59694 +       init_waitqueue_head(&tp->wait_q);
59695 +}
59696 +
59697 +static struct tpm_private *tpm_private_get(void)
59698 +{
59699 +       if (!my_priv) {
59700 +               my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL);
59701 +               if (my_priv) {
59702 +                       tpm_private_init(my_priv);
59703 +               }
59704 +       }
59705 +       return my_priv;
59706 +}
59707 +
59708 +static inline void tpm_private_free(void)
59709 +{
59710 +       kfree(my_priv);
59711 +       my_priv = NULL;
59712 +}
59713 +
59714 +/**************************************************************
59715 +
59716 + The interface that lets the TPM plugin register its callback
59717 + function and send data to another partition through this module.
59718 +
59719 +**************************************************************/
59720 +
59721 +static DEFINE_MUTEX(upperlayer_lock);
59722 +static DEFINE_MUTEX(suspend_lock);
59723 +static struct tpmfe_device *upperlayer_tpmfe;
59724 +
59725 +/*
59726 + * Send data via this module by calling this function
59727 + */
59728 +int tpm_fe_send(struct tpm_private *tp, const u8 * buf, size_t count, void *ptr)
59729 +{
59730 +       int sent;
59731 +
59732 +       mutex_lock(&suspend_lock);
59733 +       sent = tpm_xmit(tp, buf, count, 0, ptr);
59734 +       mutex_unlock(&suspend_lock);
59735 +
59736 +       return sent;
59737 +}
59738 +EXPORT_SYMBOL(tpm_fe_send);
59739 +
59740 +/*
59741 + * Register a callback for receiving data from this module
59742 + */
59743 +int tpm_fe_register_receiver(struct tpmfe_device *tpmfe_dev)
59744 +{
59745 +       int rc = 0;
59746 +
59747 +       mutex_lock(&upperlayer_lock);
59748 +       if (NULL == upperlayer_tpmfe) {
59749 +               upperlayer_tpmfe = tpmfe_dev;
59750 +               tpmfe_dev->max_tx_size = TPMIF_TX_RING_SIZE * PAGE_SIZE;
59751 +               tpmfe_dev->tpm_private = tpm_private_get();
59752 +               if (!tpmfe_dev->tpm_private) {
59753 +                       rc = -ENOMEM;
59754 +               }
59755 +       } else {
59756 +               rc = -EBUSY;
59757 +       }
59758 +       mutex_unlock(&upperlayer_lock);
59759 +       return rc;
59760 +}
59761 +EXPORT_SYMBOL(tpm_fe_register_receiver);
59762 +
59763 +/*
59764 + * Unregister the callback for receiving data from this module
59765 + */
59766 +void tpm_fe_unregister_receiver(void)
59767 +{
59768 +       mutex_lock(&upperlayer_lock);
59769 +       upperlayer_tpmfe = NULL;
59770 +       mutex_unlock(&upperlayer_lock);
59771 +}
59772 +EXPORT_SYMBOL(tpm_fe_unregister_receiver);
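/*
 * A minimal sketch of how an upper-layer TPM driver might attach to the
 * interface above.  The callback prototypes are assumptions inferred
 * from how ->receive() and ->status() are invoked in this file, and the
 * example_* names are illustrative; only the tpmfe_device members
 * actually used here are relied upon.
 */
static int example_receive(const u8 *buf, size_t count, const void *ptr)
{
	/* Complete the command identified by ptr with this response. */
	return 0;
}

static void example_status(unsigned int flags)
{
	/* Track TPMFE_STATUS_CONNECTED as the backend comes and goes. */
}

static struct tpmfe_device example_tpmfe = {
	.receive = example_receive,
	.status  = example_status,
};

static int example_send(const u8 *cmd, size_t len, void *tag)
{
	int err = tpm_fe_register_receiver(&example_tpmfe);

	if (err)
		return err;
	/* tpm_private was filled in by tpm_fe_register_receiver(). */
	return tpm_fe_send(example_tpmfe.tpm_private, cmd, len, tag);
}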
59773 +
59774 +/*
59775 + * Call this function to send data to the upper layer's
59776 + * registered receiver function.
59777 + */
59778 +static int tpm_fe_send_upperlayer(const u8 * buf, size_t count,
59779 +                                  const void *ptr)
59780 +{
59781 +       int rc = 0;
59782 +
59783 +       mutex_lock(&upperlayer_lock);
59784 +
59785 +       if (upperlayer_tpmfe && upperlayer_tpmfe->receive)
59786 +               rc = upperlayer_tpmfe->receive(buf, count, ptr);
59787 +
59788 +       mutex_unlock(&upperlayer_lock);
59789 +       return rc;
59790 +}
59791 +
59792 +/**************************************************************
59793 + XENBUS support code
59794 +**************************************************************/
59795 +
59796 +static int setup_tpmring(struct xenbus_device *dev,
59797 +                         struct tpm_private *tp)
59798 +{
59799 +       tpmif_tx_interface_t *sring;
59800 +       int err;
59801 +
59802 +       sring = (void *)__get_free_page(GFP_KERNEL);
59803 +       if (!sring) {
59804 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
59805 +               return -ENOMEM;
59806 +       }
59807 +       tp->tx = sring;
59808 +
59809 +       tpm_allocate_buffers(tp);
59810 +
59811 +       err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
59812 +       if (err < 0) {
59813 +               free_page((unsigned long)sring);
59814 +               tp->tx = NULL;
59815 +               xenbus_dev_fatal(dev, err, "allocating grant reference");
59816 +               goto fail;
59817 +       }
59818 +       tp->ring_ref = err;
59819 +
59820 +       err = xenbus_alloc_evtchn(dev, &tp->evtchn);
59821 +       if (err)
59822 +               goto fail;
59823 +
59824 +       tpmif_connect(tp, dev->otherend_id);
59825 +
59826 +       return 0;
59827 +fail:
59828 +       return err;
59829 +}
59830 +
59831 +
59832 +static void destroy_tpmring(struct tpm_private *tp)
59833 +{
59834 +       tpmif_set_connected_state(tp, 0);
59835 +       if (tp->tx != NULL) {
59836 +               gnttab_end_foreign_access(tp->ring_ref, 0,
59837 +                                         (unsigned long)tp->tx);
59838 +               tp->tx = NULL;
59839 +       }
59840 +
59841 +       if (tp->irq)
59842 +               unbind_from_irqhandler(tp->irq, NULL);
59843 +       tp->evtchn = tp->irq = 0;
59844 +}
59845 +
59846 +
59847 +static int talk_to_backend(struct xenbus_device *dev,
59848 +                           struct tpm_private *tp)
59849 +{
59850 +       const char *message = NULL;
59851 +       int err;
59852 +       xenbus_transaction_t xbt;
59853 +
59854 +       err = setup_tpmring(dev, tp);
59855 +       if (err) {
59856 +               xenbus_dev_fatal(dev, err, "setting up ring");
59857 +               goto out;
59858 +       }
59859 +
59860 +again:
59861 +       err = xenbus_transaction_start(&xbt);
59862 +       if (err) {
59863 +               xenbus_dev_fatal(dev, err, "starting transaction");
59864 +               goto destroy_tpmring;
59865 +       }
59866 +
59867 +       err = xenbus_printf(xbt, dev->nodename,
59868 +                           "ring-ref", "%u", tp->ring_ref);
59869 +       if (err) {
59870 +               message = "writing ring-ref";
59871 +               goto abort_transaction;
59872 +       }
59873 +
59874 +       err = xenbus_printf(xbt, dev->nodename,
59875 +                           "event-channel", "%u", tp->evtchn);
59876 +       if (err) {
59877 +               message = "writing event-channel";
59878 +               goto abort_transaction;
59879 +       }
59880 +
59881 +       err = xenbus_printf(xbt, dev->nodename,
59882 +                           "state", "%d", XenbusStateInitialised);
59883 +       if (err) {
59884 +               goto abort_transaction;
59885 +       }
59886 +
59887 +       err = xenbus_transaction_end(xbt, 0);
59888 +       if (err == -EAGAIN)
59889 +               goto again;
59890 +       if (err) {
59891 +               xenbus_dev_fatal(dev, err, "completing transaction");
59892 +               goto destroy_tpmring;
59893 +       }
59894 +       return 0;
59895 +
59896 +abort_transaction:
59897 +       xenbus_transaction_end(xbt, 1);
59898 +       if (message)
59899 +               xenbus_dev_error(dev, err, "%s", message);
59900 +destroy_tpmring:
59901 +       destroy_tpmring(tp);
59902 +out:
59903 +       return err;
59904 +}
59905 +
59906 +/**
59907 + * Callback received when the backend's state changes.
59908 + */
59909 +static void backend_changed(struct xenbus_device *dev,
59910 +                           XenbusState backend_state)
59911 +{
59912 +       struct tpm_private *tp = dev->data;
59913 +       DPRINTK("\n");
59914 +
59915 +       switch (backend_state) {
59916 +       case XenbusStateInitialising:
59917 +       case XenbusStateInitWait:
59918 +       case XenbusStateInitialised:
59919 +       case XenbusStateUnknown:
59920 +               break;
59921 +
59922 +       case XenbusStateConnected:
59923 +               tpmif_set_connected_state(tp, 1);
59924 +               break;
59925 +
59926 +       case XenbusStateClosing:
59927 +               tpmif_set_connected_state(tp, 0);
59928 +               break;
59929 +
59930 +       case XenbusStateClosed:
59931 +               if (tp->is_suspended == 0) {
59932 +                       device_unregister(&dev->dev);
59933 +               }
59934 +               break;
59935 +       }
59936 +}
59937 +
59938 +
59939 +static int tpmfront_probe(struct xenbus_device *dev,
59940 +                          const struct xenbus_device_id *id)
59941 +{
59942 +       int err;
59943 +       int handle;
59944 +       struct tpm_private *tp = tpm_private_get();
59945 +
59946 +       err = xenbus_scanf(XBT_NULL, dev->nodename,
59947 +                          "handle", "%i", &handle);
59948 +       if (XENBUS_EXIST_ERR(err))
59949 +               return err;
59950 +
59951 +       if (err < 0) {
59952 +               xenbus_dev_fatal(dev, err, "reading handle");
59953 +               return err;
59954 +       }
59955 +
59956 +       tp->dev = dev;
59957 +       dev->data = tp;
59958 +
59959 +       err = talk_to_backend(dev, tp);
59960 +       if (err) {
59961 +               tpm_private_free();
59962 +               dev->data = NULL;
59963 +               return err;
59964 +       }
59965 +       return 0;
59966 +}
59967 +
59968 +
59969 +static int tpmfront_remove(struct xenbus_device *dev)
59970 +{
59971 +       struct tpm_private *tp = dev->data;
59972 +       destroy_tpmring(tp);
59973 +       return 0;
59974 +}
59975 +
59976 +static int
59977 +tpmfront_suspend(struct xenbus_device *dev)
59978 +{
59979 +       struct tpm_private *tp = dev->data;
59980 +       u32 ctr;
59981 +
59982 +       /* lock, so no app can send */
59983 +       mutex_lock(&suspend_lock);
59984 +       tp->is_suspended = 1;
59985 +
59986 +       for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 25; ctr++) {
59987 +               if ((ctr % 10) == 0)
59988 +                       printk("TPM-FE [INFO]: Waiting for outstanding request.\n");
59989 +               /*
59990 +                * Wait for the outstanding request to be answered.
59991 +                */
59992 +               interruptible_sleep_on_timeout(&tp->wait_q, 100);
59993 +       }
59994 +
59995 +       if (atomic_read(&tp->tx_busy)) {
59996 +               /*
59997 +                * A temporary work-around.
59998 +                */
59999 +               printk("TPM-FE [WARNING]: Resetting busy flag.\n");
60000 +               atomic_set(&tp->tx_busy, 0);
60001 +       }
60002 +
60003 +       return 0;
60004 +}
60005 +
60006 +static int
60007 +tpmfront_resume(struct xenbus_device *dev)
60008 +{
60009 +       struct tpm_private *tp = dev->data;
60010 +       return talk_to_backend(dev, tp);
60011 +}
60012 +
60013 +static void
60014 +tpmif_connect(struct tpm_private *tp, domid_t domid)
60015 +{
60016 +       int err;
60017 +
60018 +       tp->backend_id = domid;
60019 +
60020 +       err = bind_evtchn_to_irqhandler(tp->evtchn,
60021 +                                       tpmif_int, SA_SAMPLE_RANDOM, "tpmif",
60022 +                                       tp);
60023 +       if (err <= 0) {
60024 +               WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err);
60025 +               return;
60026 +       }
60027 +
60028 +       tp->irq = err;
60029 +}
60030 +
60031 +static struct xenbus_device_id tpmfront_ids[] = {
60032 +       { "vtpm" },
60033 +       { "" }
60034 +};
60035 +
60036 +static struct xenbus_driver tpmfront = {
60037 +       .name = "vtpm",
60038 +       .owner = THIS_MODULE,
60039 +       .ids = tpmfront_ids,
60040 +       .probe = tpmfront_probe,
60041 +       .remove =  tpmfront_remove,
60042 +       .resume = tpmfront_resume,
60043 +       .otherend_changed = backend_changed,
60044 +       .suspend = tpmfront_suspend,
60045 +};
60046 +
60047 +static void __init init_tpm_xenbus(void)
60048 +{
60049 +       xenbus_register_frontend(&tpmfront);
60050 +}
60051 +
60052 +static void __exit exit_tpm_xenbus(void)
60053 +{
60054 +       xenbus_unregister_driver(&tpmfront);
60055 +}
60056 +
60057 +
60058 +static int
60059 +tpm_allocate_buffers(struct tpm_private *tp)
60060 +{
60061 +       unsigned int i;
60062 +
60063 +       for (i = 0; i < TPMIF_TX_RING_SIZE; i++)
60064 +               tp->tx_buffers[i] = tx_buffer_alloc();
60065 +       return 1;
60066 +}
60067 +
60068 +static void
60069 +tpmif_rx_action(unsigned long priv)
60070 +{
60071 +       struct tpm_private *tp = (struct tpm_private *)priv;
60072 +
60073 +       int i = 0;
60074 +       unsigned int received;
60075 +       unsigned int offset = 0;
60076 +       u8 *buffer;
60077 +       tpmif_tx_request_t *tx;
60078 +       tx = &tp->tx->ring[i].req;
60079 +
60080 +       received = tx->size;
60081 +
60082 +       buffer = kmalloc(received, GFP_KERNEL);
60083 +       if (NULL == buffer) {
60084 +               goto exit;
60085 +       }
60086 +
60087 +       for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
60088 +               struct tx_buffer *txb = tp->tx_buffers[i];
60089 +               tpmif_tx_request_t *tx;
60090 +               unsigned int tocopy;
60091 +
60092 +               tx = &tp->tx->ring[i].req;
60093 +               tocopy = tx->size;
60094 +               if (tocopy > PAGE_SIZE) {
60095 +                       tocopy = PAGE_SIZE;
60096 +               }
60097 +
60098 +               memcpy(&buffer[offset], txb->data, tocopy);
60099 +
60100 +               gnttab_release_grant_reference(&gref_head, tx->ref);
60101 +
60102 +               offset += tocopy;
60103 +       }
60104 +
60105 +       tpm_fe_send_upperlayer(buffer, received, tp->tx_remember);
60106 +       kfree(buffer);
60107 +
60108 +exit:
60109 +       atomic_set(&tp->tx_busy, 0);
60110 +       wake_up_interruptible(&tp->wait_q);
60111 +}
60112 +
60113 +
60114 +static irqreturn_t
60115 +tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
60116 +{
60117 +       struct tpm_private *tp = tpm_priv;
60118 +       unsigned long flags;
60119 +
60120 +       spin_lock_irqsave(&tp->tx_lock, flags);
60121 +       tpmif_rx_tasklet.data = (unsigned long)tp;
60122 +       tasklet_schedule(&tpmif_rx_tasklet);
60123 +       spin_unlock_irqrestore(&tp->tx_lock, flags);
60124 +
60125 +       return IRQ_HANDLED;
60126 +}
60127 +
60128 +
60129 +static int
60130 +tpm_xmit(struct tpm_private *tp,
60131 +         const u8 * buf, size_t count, int isuserbuffer,
60132 +         void *remember)
60133 +{
60134 +       tpmif_tx_request_t *tx;
60135 +       TPMIF_RING_IDX i;
60136 +       unsigned int offset = 0;
60137 +
60138 +       spin_lock_irq(&tp->tx_lock);
60139 +
60140 +       if (unlikely(atomic_read(&tp->tx_busy))) {
60141 +               printk("tpm_xmit: There's an outstanding request/response "
60142 +                      "on the way!\n");
60143 +               spin_unlock_irq(&tp->tx_lock);
60144 +               return -EBUSY;
60145 +       }
60146 +
60147 +       if (tp->is_connected != 1) {
60148 +               spin_unlock_irq(&tp->tx_lock);
60149 +               return -EIO;
60150 +       }
60151 +
60152 +       for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
60153 +               struct tx_buffer *txb = tp->tx_buffers[i];
60154 +               int copied;
60155 +
60156 +               if (NULL == txb) {
60157 +                       DPRINTK("txb (i=%d) is NULL. Buffers initialized?\n"
60158 +                               "Not transmitting anything!\n", i);
60159 +                       spin_unlock_irq(&tp->tx_lock);
60160 +                       return -EFAULT;
60161 +               }
60162 +               copied = tx_buffer_copy(txb, &buf[offset], count,
60163 +                                       isuserbuffer);
60164 +               if (copied < 0) {
60165 +                       /* An error occurred */
60166 +                       spin_unlock_irq(&tp->tx_lock);
60167 +                       return copied;
60168 +               }
60169 +               count -= copied;
60170 +               offset += copied;
60171 +
60172 +               tx = &tp->tx->ring[i].req;
60173 +
60174 +               tx->addr = virt_to_machine(txb->data);
60175 +               tx->size = txb->len;
60176 +
60177 +               DPRINTK("First 4 bytes sent by TPM-FE are 0x%02x 0x%02x 0x%02x 0x%02x\n",
60178 +                       txb->data[0], txb->data[1], txb->data[2], txb->data[3]);
60179 +
60180 +               /* get the grant table reference for this page */
60181 +               tx->ref = gnttab_claim_grant_reference(&gref_head);
60182 +
60183 +               if (-ENOSPC == tx->ref) {
60184 +                       spin_unlock_irq(&tp->tx_lock);
60185 +                       DPRINTK("Grant table claim reference failed in func:%s line:%d file:%s\n", __FUNCTION__, __LINE__, __FILE__);
60186 +                       return -ENOSPC;
60187 +               }
60188 +               gnttab_grant_foreign_access_ref(tx->ref,
60189 +                                               tp->backend_id,
60190 +                                               (tx->addr >> PAGE_SHIFT),
60191 +                                               0 /*RW*/);
60192 +               wmb();
60193 +       }
60194 +
60195 +       atomic_set(&tp->tx_busy, 1);
60196 +       tp->tx_remember = remember;
60197 +       mb();
60198 +
60199 +       DPRINTK("Notifying backend via event channel %d\n",
60200 +               tp->evtchn);
60201 +
60202 +       notify_remote_via_irq(tp->irq);
60203 +
60204 +       spin_unlock_irq(&tp->tx_lock);
60205 +       return offset;
60206 +}
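/*
 * Request/response lifecycle, tying the pieces above together:
 * tpm_xmit() copies the command into the per-slot tx_buffers pages,
 * grants each page to the backend, sets tx_busy and kicks the event
 * channel.  The backend writes its response into the same pages and
 * signals the channel, whereupon tpmif_int() schedules
 * tpmif_rx_action(), which reassembles the response, releases the
 * grant references, clears tx_busy and wakes any waiter (e.g. in
 * tpmfront_suspend()).
 */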
60207 +
60208 +
60209 +static void tpmif_notify_upperlayer(struct tpm_private *tp)
60210 +{
60211 +       /*
60212 +        * Notify upper layer about the state of the connection
60213 +        * to the BE.
60214 +        */
60215 +       mutex_lock(&upperlayer_lock);
60216 +
60217 +       if (upperlayer_tpmfe != NULL) {
60218 +               if (tp->is_connected) {
60219 +                       upperlayer_tpmfe->status(TPMFE_STATUS_CONNECTED);
60220 +               } else {
60221 +                       upperlayer_tpmfe->status(0);
60222 +               }
60223 +       }
60224 +       mutex_unlock(&upperlayer_lock);
60225 +}
60226 +
60227 +
60228 +static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
60229 +{
60230 +       /*
60231 +        * Don't notify upper layer if we are in suspend mode and
60232 +        * should disconnect; the assumption is that we will resume.
60233 +        * The mutex keeps apps from sending.
60234 +        */
60235 +       if (is_connected == 0 && tp->is_suspended == 1) {
60236 +               return;
60237 +       }
60238 +
60239 +       /*
60240 +        * Unlock the mutex if we are connected again
60241 +        * after being suspended - now resuming.
60242 +        * This also removes the suspend state.
60243 +        */
60244 +       if (is_connected == 1 && tp->is_suspended == 1) {
60245 +               tp->is_suspended = 0;
60246 +               /* unlock, so apps can resume sending */
60247 +               mutex_unlock(&suspend_lock);
60248 +       }
60249 +
60250 +       if (is_connected != tp->is_connected) {
60251 +               tp->is_connected = is_connected;
60252 +               tpmif_notify_upperlayer(tp);
60253 +       }
60254 +}
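/*
 * Suspend/resume sequencing: tpmfront_suspend() takes suspend_lock
 * (blocking new tpm_fe_send() callers) and sets is_suspended, so the
 * early return above keeps the disconnect that accompanies a suspend
 * from being reported upstream.  When the backend reconnects after
 * resume, the is_connected == 1 path clears is_suspended and releases
 * suspend_lock, letting senders proceed again.
 */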
60255 +
60256 +
60257 +/* =================================================================
60258 + * Initialization function.
60259 + * =================================================================
60260 + */
60261 +
60262 +static int __init
60263 +tpmif_init(void)
60264 +{
60265 +       IPRINTK("Initialising the vTPM driver.\n");
60266 +       if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE,
60267 +                                         &gref_head) < 0) {
60268 +               return -EFAULT;
60269 +       }
60270 +
60271 +       init_tpm_xenbus();
60272 +
60273 +       return 0;
60274 +}
60275 +
60276 +module_init(tpmif_init);
60277 +
60278 +static void __exit
60279 +tpmif_exit(void)
60280 +{
60281 +       exit_tpm_xenbus();
60282 +       gnttab_free_grant_references(gref_head);
60283 +}
60284 +
60285 +module_exit(tpmif_exit);
60286 +
60287 +MODULE_LICENSE("Dual BSD/GPL");
60288 +
60289 +/*
60290 + * Local variables:
60291 + *  c-file-style: "linux"
60292 + *  indent-tabs-mode: t
60293 + *  c-indent-level: 8
60294 + *  c-basic-offset: 8
60295 + *  tab-width: 8
60296 + * End:
60297 + */
60298 diff -Nurp ref-linux-2.6.16.9/drivers/xen/tpmfront/tpmfront.h tmp-linux-2.6-xen.patch/drivers/xen/tpmfront/tpmfront.h
60299 --- ref-linux-2.6.16.9/drivers/xen/tpmfront/tpmfront.h  1970-01-01 01:00:00.000000000 +0100
60300 +++ tmp-linux-2.6-xen.patch/drivers/xen/tpmfront/tpmfront.h     2006-04-10 00:05:52.000000000 +0200
60301 @@ -0,0 +1,40 @@
60302 +#ifndef TPM_FRONT_H
60303 +#define TPM_FRONT_H
60304 +
60305 +struct tpm_private {
60306 +       tpmif_tx_interface_t *tx;
60307 +       unsigned int evtchn;
60308 +       unsigned int irq;
60309 +       u8 is_connected;
60310 +       u8 is_suspended;
60311 +
60312 +       spinlock_t tx_lock;
60313 +
60314 +       struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
60315 +
60316 +       atomic_t tx_busy;
60317 +       void *tx_remember;
60318 +       domid_t backend_id;
60319 +       wait_queue_head_t wait_q;
60320 +
60321 +       struct xenbus_device *dev;
60322 +       int ring_ref;
60323 +};
60324 +
60325 +struct tx_buffer {
60326 +       unsigned int size;      /* available space in data */
60327 +       unsigned int len;       /* used space in data */
60328 +       unsigned char *data;    /* pointer to a page */
60329 +};
60330 +
60331 +#endif
60332 +
60333 +/*
60334 + * Local variables:
60335 + *  c-file-style: "linux"
60336 + *  indent-tabs-mode: t
60337 + *  c-indent-level: 8
60338 + *  c-basic-offset: 8
60339 + *  tab-width: 8
60340 + * End:
60341 + */
60342 diff -Nurp ref-linux-2.6.16.9/drivers/xen/util.c tmp-linux-2.6-xen.patch/drivers/xen/util.c
60343 --- ref-linux-2.6.16.9/drivers/xen/util.c       1970-01-01 01:00:00.000000000 +0100
60344 +++ tmp-linux-2.6-xen.patch/drivers/xen/util.c  2006-04-10 00:05:52.000000000 +0200
60345 @@ -0,0 +1,80 @@
60346 +#include <linux/config.h>
60347 +#include <linux/mm.h>
60348 +#include <linux/module.h>
60349 +#include <linux/slab.h>
60350 +#include <linux/vmalloc.h>
60351 +#include <asm/uaccess.h>
60352 +#include <xen/driver_util.h>
60353 +
60354 +static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
60355 +{
60356 +       /* apply_to_page_range() does all the hard work. */
60357 +       return 0;
60358 +}
60359 +
60360 +struct vm_struct *alloc_vm_area(unsigned long size)
60361 +{
60362 +       struct vm_struct *area;
60363 +
60364 +       area = get_vm_area(size, VM_IOREMAP);
60365 +       if (area == NULL)
60366 +               return NULL;
60367 +
60368 +       /*
60369 +        * This ensures that page tables are constructed for this region
60370 +        * of kernel virtual address space and mapped into init_mm.
60371 +        */
60372 +       if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
60373 +                               area->size, f, NULL)) {
60374 +               free_vm_area(area);
60375 +               return NULL;
60376 +       }
60377 +
60378 +       return area;
60379 +}
60380 +EXPORT_SYMBOL_GPL(alloc_vm_area);
60381 +
60382 +void free_vm_area(struct vm_struct *area)
60383 +{
60384 +       struct vm_struct *ret;
60385 +       ret = remove_vm_area(area->addr);
60386 +       BUG_ON(ret != area);
60387 +       kfree(area);
60388 +}
60389 +EXPORT_SYMBOL_GPL(free_vm_area);
60390 +
60391 +void lock_vm_area(struct vm_struct *area)
60392 +{
60393 +       unsigned long i;
60394 +       char c;
60395 +
60396 +       /*
60397 +        * Prevent context switch to a lazy mm that doesn't have this area
60398 +        * mapped into its page tables.
60399 +        */
60400 +       preempt_disable();
60401 +
60402 +       /*
60403 +        * Ensure that the page tables are mapped into the current mm. The
60404 +        * page-fault path will copy the page directory pointers from init_mm.
60405 +        */
60406 +       for (i = 0; i < area->size; i += PAGE_SIZE)
60407 +               (void)__get_user(c, (char __user *)area->addr + i);
60408 +}
60409 +EXPORT_SYMBOL_GPL(lock_vm_area);
60410 +
60411 +void unlock_vm_area(struct vm_struct *area)
60412 +{
60413 +       preempt_enable();
60414 +}
60415 +EXPORT_SYMBOL_GPL(unlock_vm_area);
60416 +
60417 +/*
60418 + * Local variables:
60419 + *  c-file-style: "linux"
60420 + *  indent-tabs-mode: t
60421 + *  c-indent-level: 8
60422 + *  c-basic-offset: 8
60423 + *  tab-width: 8
60424 + * End:
60425 + */
60426 diff -Nurp ref-linux-2.6.16.9/drivers/xen/xenbus/Makefile tmp-linux-2.6-xen.patch/drivers/xen/xenbus/Makefile
60427 --- ref-linux-2.6.16.9/drivers/xen/xenbus/Makefile      1970-01-01 01:00:00.000000000 +0100
60428 +++ tmp-linux-2.6-xen.patch/drivers/xen/xenbus/Makefile 2006-04-10 00:05:52.000000000 +0200
60429 @@ -0,0 +1,8 @@
60430 +obj-y  += xenbus.o
60431 +
60432 +xenbus-objs =
60433 +xenbus-objs += xenbus_client.o
60434 +xenbus-objs += xenbus_comms.o
60435 +xenbus-objs += xenbus_xs.o
60436 +xenbus-objs += xenbus_probe.o
60437 +xenbus-objs += xenbus_dev.o
60438 diff -Nurp ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_client.c tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_client.c
60439 --- ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_client.c       1970-01-01 01:00:00.000000000 +0100
60440 +++ tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_client.c  2006-04-10 00:05:52.000000000 +0200
60441 @@ -0,0 +1,412 @@
60442 +/******************************************************************************
60443 + * Client-facing interface for the Xenbus driver.  In other words, the
60444 + * interface between the Xenbus and the device-specific code, be it the
60445 + * frontend or the backend of that driver.
60446 + *
60447 + * Copyright (C) 2005 XenSource Ltd
60448 + * 
60449 + * This program is free software; you can redistribute it and/or
60450 + * modify it under the terms of the GNU General Public License version 2
60451 + * as published by the Free Software Foundation; or, when distributed
60452 + * separately from the Linux kernel or incorporated into other
60453 + * software packages, subject to the following license:
60454 + * 
60455 + * Permission is hereby granted, free of charge, to any person obtaining a copy
60456 + * of this source file (the "Software"), to deal in the Software without
60457 + * restriction, including without limitation the rights to use, copy, modify,
60458 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60459 + * and to permit persons to whom the Software is furnished to do so, subject to
60460 + * the following conditions:
60461 + * 
60462 + * The above copyright notice and this permission notice shall be included in
60463 + * all copies or substantial portions of the Software.
60464 + * 
60465 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60466 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60467 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60468 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60469 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60470 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60471 + * IN THE SOFTWARE.
60472 + */
60473 +
60474 +#include <xen/evtchn.h>
60475 +#include <xen/gnttab.h>
60476 +#include <xen/xenbus.h>
60477 +#include <xen/driver_util.h>
60478 +
60479 +/* xenbus_probe.c */
60480 +extern char *kasprintf(const char *fmt, ...);
60481 +
60482 +#define DPRINTK(fmt, args...) \
60483 +    pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
60484 +
60485 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
60486 +                     struct xenbus_watch *watch,
60487 +                     void (*callback)(struct xenbus_watch *,
60488 +                                      const char **, unsigned int))
60489 +{
60490 +       int err;
60491 +
60492 +       watch->node = path;
60493 +       watch->callback = callback;
60494 +
60495 +       err = register_xenbus_watch(watch);
60496 +
60497 +       if (err) {
60498 +               watch->node = NULL;
60499 +               watch->callback = NULL;
60500 +               xenbus_dev_fatal(dev, err, "adding watch on %s", path);
60501 +       }
60502 +
60503 +       return err;
60504 +}
60505 +EXPORT_SYMBOL_GPL(xenbus_watch_path);
60506 +
60507 +
60508 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
60509 +                      const char *path2, struct xenbus_watch *watch,
60510 +                      void (*callback)(struct xenbus_watch *,
60511 +                                       const char **, unsigned int))
60512 +{
60513 +       int err;
60514 +       char *state = kasprintf("%s/%s", path, path2);
60515 +       if (!state) {
60516 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
60517 +               return -ENOMEM;
60518 +       }
60519 +       err = xenbus_watch_path(dev, state, watch, callback);
60520 +
60521 +       if (err)
60522 +               kfree(state);
60523 +       return err;
60524 +}
60525 +EXPORT_SYMBOL_GPL(xenbus_watch_path2);
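/*
 * Sketch of a typical caller: watch the peer's "state" node and re-read
 * it from the callback.  example_watch and example_otherend_changed are
 * illustrative names; the callback prototype matches the one taken by
 * xenbus_watch_path() above.
 */
static struct xenbus_watch example_watch;

static void example_otherend_changed(struct xenbus_watch *watch,
				     const char **vec, unsigned int len)
{
	/* Re-read the watched node and react to the state change. */
}

/* In a driver's probe routine, with dev its xenbus_device:
 *	xenbus_watch_path2(dev, dev->otherend, "state",
 *			   &example_watch, example_otherend_changed);
 */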
60526 +
60527 +
60528 +int xenbus_switch_state(struct xenbus_device *dev, XenbusState state)
60529 +{
60530 +       /* We check whether the state is currently set to the given value, and
60531 +          if not, then the state is set.  We don't want to unconditionally
60532 +          write the given state, because we don't want to fire watches
60533 +          unnecessarily.  Furthermore, if the node has gone, we don't write
60534 +          to it, as the device will be tearing down, and we don't want to
60535 +          resurrect that directory.
60536 +
60537 +          Note that, because of this cached value of our state, this function
60538 +          will not work inside a Xenstore transaction (something it used
60539 +          to attempt in the past) because dev->state would not get reset if
60540 +          the transaction was aborted.
60541 +
60542 +        */
60543 +
60544 +       int current_state;
60545 +       int err;
60546 +
60547 +       if (state == dev->state)
60548 +               return 0;
60549 +
60550 +       err = xenbus_scanf(XBT_NULL, dev->nodename, "state", "%d",
60551 +                          &current_state);
60552 +       if (err != 1)
60553 +               return 0;
60554 +
60555 +       err = xenbus_printf(XBT_NULL, dev->nodename, "state", "%d", state);
60556 +       if (err) {
60557 +               if (state != XenbusStateClosing) /* Avoid looping */
60558 +                       xenbus_dev_fatal(dev, err, "writing new state");
60559 +               return err;
60560 +       }
60561 +
60562 +       dev->state = state;
60563 +
60564 +       return 0;
60565 +}
60566 +EXPORT_SYMBOL_GPL(xenbus_switch_state);
60567 +
60568 +
60569 +/**
60570 + * Return the path to the error node for the given device, or NULL on failure.
60571 + * If the value returned is non-NULL, the caller is responsible for kfree()ing it.
60572 + */
60573 +static char *error_path(struct xenbus_device *dev)
60574 +{
60575 +       return kasprintf("error/%s", dev->nodename);
60576 +}
60577 +
60578 +
60579 +void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
60580 +               va_list ap)
60581 +{
60582 +       int ret;
60583 +       unsigned int len;
60584 +       char *printf_buffer = NULL, *path_buffer = NULL;
60585 +
60586 +#define PRINTF_BUFFER_SIZE 4096
60587 +       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
60588 +       if (printf_buffer == NULL)
60589 +               goto fail;
60590 +
60591 +       len = sprintf(printf_buffer, "%i ", -err);
60592 +       ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
60593 +
60594 +       BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
60595 +
60596 +       dev_err(&dev->dev, "%s\n", printf_buffer);
60597 +
60598 +       path_buffer = error_path(dev);
60599 +
60600 +       if (path_buffer == NULL) {
60601 +               printk("xenbus: failed to write error node for %s (%s)\n",
60602 +                      dev->nodename, printf_buffer);
60603 +               goto fail;
60604 +       }
60605 +
60606 +       if (xenbus_write(XBT_NULL, path_buffer, "error", printf_buffer) != 0) {
60607 +               printk("xenbus: failed to write error node for %s (%s)\n",
60608 +                      dev->nodename, printf_buffer);
60609 +               goto fail;
60610 +       }
60611 +
60612 +fail:
60613 +       if (printf_buffer)
60614 +               kfree(printf_buffer);
60615 +       if (path_buffer)
60616 +               kfree(path_buffer);
60617 +}
60618 +
60619 +
60620 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
60621 +                     ...)
60622 +{
60623 +       va_list ap;
60624 +
60625 +       va_start(ap, fmt);
60626 +       _dev_error(dev, err, fmt, ap);
60627 +       va_end(ap);
60628 +}
60629 +EXPORT_SYMBOL_GPL(xenbus_dev_error);
60630 +
60631 +
60632 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
60633 +                     ...)
60634 +{
60635 +       va_list ap;
60636 +
60637 +       va_start(ap, fmt);
60638 +       _dev_error(dev, err, fmt, ap);
60639 +       va_end(ap);
60640 +
60641 +       xenbus_switch_state(dev, XenbusStateClosing);
60642 +}
60643 +EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
60644 +
60645 +
60646 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
60647 +{
60648 +       int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
60649 +       if (err < 0)
60650 +               xenbus_dev_fatal(dev, err, "granting access to ring page");
60651 +       return err;
60652 +}
60653 +EXPORT_SYMBOL_GPL(xenbus_grant_ring);
60654 +
60655 +
60656 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
60657 +{
60658 +       evtchn_op_t op = {
60659 +               .cmd = EVTCHNOP_alloc_unbound,
60660 +               .u.alloc_unbound.dom = DOMID_SELF,
60661 +               .u.alloc_unbound.remote_dom = dev->otherend_id
60662 +       };
60663 +       int err = HYPERVISOR_event_channel_op(&op);
60664 +       if (err)
60665 +               xenbus_dev_fatal(dev, err, "allocating event channel");
60666 +       else
60667 +               *port = op.u.alloc_unbound.port;
60668 +       return err;
60669 +}
60670 +EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
60671 +
60672 +
60673 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
60674 +{
60675 +       evtchn_op_t op = {
60676 +               .cmd = EVTCHNOP_bind_interdomain,
60677 +               .u.bind_interdomain.remote_dom = dev->otherend_id,
60678 +               .u.bind_interdomain.remote_port = remote_port,
60679 +       };
60680 +       int err = HYPERVISOR_event_channel_op(&op);
60681 +       if (err)
60682 +               xenbus_dev_fatal(dev, err,
60683 +                                "binding to event channel %d from domain %d",
60684 +                                remote_port, dev->otherend_id);
60685 +       else
60686 +               *port = op.u.bind_interdomain.local_port;
60687 +       return err;
60688 +}
60689 +EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
60690 +
60691 +
60692 +int xenbus_free_evtchn(struct xenbus_device *dev, int port)
60693 +{
60694 +       evtchn_op_t op = {
60695 +               .cmd = EVTCHNOP_close,
60696 +               .u.close.port = port,
60697 +       };
60698 +       int err = HYPERVISOR_event_channel_op(&op);
60699 +       if (err)
60700 +               xenbus_dev_error(dev, err, "freeing event channel %d", port);
60701 +       return err;
60702 +}
60703 +
60704 +
60705 +/* Based on Rusty Russell's skeleton driver's map_page */
60706 +int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
60707 +{
60708 +       struct gnttab_map_grant_ref op = {
60709 +               .flags = GNTMAP_host_map,
60710 +               .ref   = gnt_ref,
60711 +               .dom   = dev->otherend_id,
60712 +       };
60713 +       struct vm_struct *area;
60714 +
60715 +       *vaddr = NULL;
60716 +
60717 +       area = alloc_vm_area(PAGE_SIZE);
60718 +       if (!area)
60719 +               return -ENOMEM;
60720 +
60721 +       op.host_addr = (unsigned long)area->addr;
60722 +
60723 +       lock_vm_area(area);
60724 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
60725 +       unlock_vm_area(area);
60726 +
60727 +       if (op.status != GNTST_okay) {
60728 +               free_vm_area(area);
60729 +               xenbus_dev_fatal(dev, op.status,
60730 +                                "mapping in shared page %d from domain %d",
60731 +                                gnt_ref, dev->otherend_id);
60732 +               return op.status;
60733 +       }
60734 +
60735 +       /* Stuff the handle in an unused field */
60736 +       area->phys_addr = (unsigned long)op.handle;
60737 +
60738 +       *vaddr = area->addr;
60739 +       return 0;
60740 +}
60741 +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
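/*
 * Sketch of the intended pairing, as a backend might use it at connect
 * time (example_map_ring is an illustrative name): map the frontend's
 * ring page into a fresh virtual area, and later release it by calling
 * xenbus_unmap_ring_vfree() on the same address.
 */
static void *example_map_ring(struct xenbus_device *dev, int gnt_ref)
{
	void *ring;

	if (xenbus_map_ring_valloc(dev, gnt_ref, &ring))
		return NULL;	/* xenbus_dev_fatal() was already called */
	return ring;		/* pass to xenbus_unmap_ring_vfree() when done */
}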
60742 +
60743 +
60744 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
60745 +                  grant_handle_t *handle, void *vaddr)
60746 +{
60747 +       struct gnttab_map_grant_ref op = {
60748 +               .host_addr = (unsigned long)vaddr,
60749 +               .flags     = GNTMAP_host_map,
60750 +               .ref       = gnt_ref,
60751 +               .dom       = dev->otherend_id,
60752 +       };
60753 +
60754 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
60755 +
60756 +       if (op.status != GNTST_okay) {
60757 +               xenbus_dev_fatal(dev, op.status,
60758 +                                "mapping in shared page %d from domain %d",
60759 +                                gnt_ref, dev->otherend_id);
60760 +       } else
60761 +               *handle = op.handle;
60762 +
60763 +       return op.status;
60764 +}
60765 +EXPORT_SYMBOL_GPL(xenbus_map_ring);
60766 +
60767 +
60768 +/* Based on Rusty Russell's skeleton driver's unmap_page */
60769 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
60770 +{
60771 +       struct vm_struct *area;
60772 +       struct gnttab_unmap_grant_ref op = {
60773 +               .host_addr = (unsigned long)vaddr,
60774 +       };
60775 +
60776 +       /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
60777 +        * method so that we don't have to muck with vmalloc internals here.
60778 +        * We could force the user to hang on to their struct vm_struct from
60779 +        * xenbus_map_ring_valloc, but these 6 lines considerably simplify
60780 +        * this API.
60781 +        */
60782 +       read_lock(&vmlist_lock);
60783 +       for (area = vmlist; area != NULL; area = area->next) {
60784 +               if (area->addr == vaddr)
60785 +                       break;
60786 +       }
60787 +       read_unlock(&vmlist_lock);
60788 +
60789 +       if (!area) {
60790 +               xenbus_dev_error(dev, -ENOENT,
60791 +                                "can't find mapped virtual address %p", vaddr);
60792 +               return GNTST_bad_virt_addr;
60793 +       }
60794 +
60795 +       op.handle = (grant_handle_t)area->phys_addr;
60796 +
60797 +       lock_vm_area(area);
60798 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
60799 +       unlock_vm_area(area);
60800 +
60801 +       if (op.status == GNTST_okay)
60802 +               free_vm_area(area);
60803 +       else
60804 +               xenbus_dev_error(dev, op.status,
60805 +                                "unmapping page at handle %d error %d",
60806 +                                (int16_t)area->phys_addr, op.status);
60807 +
60808 +       return op.status;
60809 +}
60810 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
60811 +
60812 +
60813 +int xenbus_unmap_ring(struct xenbus_device *dev,
60814 +                    grant_handle_t handle, void *vaddr)
60815 +{
60816 +       struct gnttab_unmap_grant_ref op = {
60817 +               .host_addr = (unsigned long)vaddr,
60818 +               .handle    = handle,
60819 +       };
60820 +
60821 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
60822 +
60823 +       if (op.status != GNTST_okay)
60824 +               xenbus_dev_error(dev, op.status,
60825 +                                "unmapping page at handle %d error %d",
60826 +                                handle, op.status);
60827 +
60828 +       return op.status;
60829 +}
60830 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
60831 +
60832 +
60833 +XenbusState xenbus_read_driver_state(const char *path)
60834 +{
60835 +       XenbusState result;
60836 +       int err = xenbus_gather(XBT_NULL, path, "state", "%d", &result, NULL);
60837 +       if (err)
60838 +               result = XenbusStateClosed;
60839 +
60840 +       return result;
60841 +}
60842 +EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
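/*
 * This allows a driver to poll its peer without installing a watch,
 * for example (illustrative use):
 *
 *	if (xenbus_read_driver_state(dev->otherend) == XenbusStateClosed)
 *		device_is_gone();
 */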
60843 +
60844 +
60845 +/*
60846 + * Local variables:
60847 + *  c-file-style: "linux"
60848 + *  indent-tabs-mode: t
60849 + *  c-indent-level: 8
60850 + *  c-basic-offset: 8
60851 + *  tab-width: 8
60852 + * End:
60853 + */
60854 diff -Nurp ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_comms.c tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_comms.c
60855 --- ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_comms.c        1970-01-01 01:00:00.000000000 +0100
60856 +++ tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_comms.c   2006-04-10 00:05:52.000000000 +0200
60857 @@ -0,0 +1,218 @@
60858 +/******************************************************************************
60859 + * xenbus_comms.c
60860 + *
60861 + * Low-level code that talks to the Xen Store: ring buffer and event channel.
60862 + *
60863 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
60864 + * 
60865 + * This program is free software; you can redistribute it and/or
60866 + * modify it under the terms of the GNU General Public License version 2
60867 + * as published by the Free Software Foundation; or, when distributed
60868 + * separately from the Linux kernel or incorporated into other
60869 + * software packages, subject to the following license:
60870 + * 
60871 + * Permission is hereby granted, free of charge, to any person obtaining a copy
60872 + * of this source file (the "Software"), to deal in the Software without
60873 + * restriction, including without limitation the rights to use, copy, modify,
60874 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60875 + * and to permit persons to whom the Software is furnished to do so, subject to
60876 + * the following conditions:
60877 + * 
60878 + * The above copyright notice and this permission notice shall be included in
60879 + * all copies or substantial portions of the Software.
60880 + * 
60881 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60882 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60883 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60884 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60885 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60886 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60887 + * IN THE SOFTWARE.
60888 + */
60889 +
60890 +#include <asm/hypervisor.h>
60891 +#include <xen/evtchn.h>
60892 +#include <linux/wait.h>
60893 +#include <linux/interrupt.h>
60894 +#include <linux/sched.h>
60895 +#include <linux/err.h>
60896 +#include <xen/xenbus.h>
60897 +#include "xenbus_comms.h"
60898 +
60899 +static int xenbus_irq;
60900 +
60901 +extern void xenbus_probe(void *);
60902 +extern int xenstored_ready;
60903 +static DECLARE_WORK(probe_work, xenbus_probe, NULL);
60904 +
60905 +DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
60906 +
60907 +static inline struct xenstore_domain_interface *xenstore_domain_interface(void)
60908 +{
60909 +       return mfn_to_virt(xen_start_info->store_mfn);
60910 +}
60911 +
60912 +static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
60913 +{
60914 +       if (unlikely(xenstored_ready == 0)) {
60915 +               xenstored_ready = 1;
60916 +               schedule_work(&probe_work);
60917 +       }
60918 +
60919 +       wake_up(&xb_waitq);
60920 +       return IRQ_HANDLED;
60921 +}
60922 +
60923 +static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
60924 +{
60925 +       return ((prod - cons) <= XENSTORE_RING_SIZE);
60926 +}
60927 +
60928 +static void *get_output_chunk(XENSTORE_RING_IDX cons,
60929 +                             XENSTORE_RING_IDX prod,
60930 +                             char *buf, uint32_t *len)
60931 +{
60932 +       *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
60933 +       if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
60934 +               *len = XENSTORE_RING_SIZE - (prod - cons);
60935 +       return buf + MASK_XENSTORE_IDX(prod);
60936 +}
60937 +
60938 +static const void *get_input_chunk(XENSTORE_RING_IDX cons,
60939 +                                  XENSTORE_RING_IDX prod,
60940 +                                  const char *buf, uint32_t *len)
60941 +{
60942 +       *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
60943 +       if ((prod - cons) < *len)
60944 +               *len = prod - cons;
60945 +       return buf + MASK_XENSTORE_IDX(cons);
60946 +}
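/*
 * Worked example, assuming XENSTORE_RING_SIZE is 1024 (so the mask is
 * i & 1023): with cons == 1000 and prod == 1030 there are 30 bytes
 * pending but they wrap, so get_input_chunk() returns buf + 1000 with
 * *len == 24 (the distance to the end of the ring); once the caller
 * consumes those and advances cons to 1024, the next call returns
 * buf + 0 with *len == 6.
 */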
60947 +
60948 +int xb_write(const void *data, unsigned len)
60949 +{
60950 +       struct xenstore_domain_interface *intf = xenstore_domain_interface();
60951 +       XENSTORE_RING_IDX cons, prod;
60952 +       int rc;
60953 +
60954 +       while (len != 0) {
60955 +               void *dst;
60956 +               unsigned int avail;
60957 +
60958 +               rc = wait_event_interruptible(
60959 +                       xb_waitq,
60960 +                       (intf->req_prod - intf->req_cons) !=
60961 +                       XENSTORE_RING_SIZE);
60962 +               if (rc < 0)
60963 +                       return rc;
60964 +
60965 +               /* Read indexes, then verify. */
60966 +               cons = intf->req_cons;
60967 +               prod = intf->req_prod;
60968 +               mb();
60969 +               if (!check_indexes(cons, prod)) {
60970 +                       intf->req_cons = intf->req_prod = 0;
60971 +                       return -EIO;
60972 +               }
60973 +
60974 +               dst = get_output_chunk(cons, prod, intf->req, &avail);
60975 +               if (avail == 0)
60976 +                       continue;
60977 +               if (avail > len)
60978 +                       avail = len;
60979 +
60980 +               memcpy(dst, data, avail);
60981 +               data += avail;
60982 +               len -= avail;
60983 +
60984 +               /* Other side must not see new header until data is there. */
60985 +               wmb();
60986 +               intf->req_prod += avail;
60987 +
60988 +               /* This implies mb() before other side sees interrupt. */
60989 +               notify_remote_via_evtchn(xen_start_info->store_evtchn);
60990 +       }
60991 +
60992 +       return 0;
60993 +}
60994 +
60995 +int xb_read(void *data, unsigned len)
60996 +{
60997 +       struct xenstore_domain_interface *intf = xenstore_domain_interface();
60998 +       XENSTORE_RING_IDX cons, prod;
60999 +       int rc;
61000 +
61001 +       while (len != 0) {
61002 +               unsigned int avail;
61003 +               const char *src;
61004 +
61005 +               rc = wait_event_interruptible(
61006 +                       xb_waitq,
61007 +                       intf->rsp_cons != intf->rsp_prod);
61008 +               if (rc < 0)
61009 +                       return rc;
61010 +
61011 +               /* Read indexes, then verify. */
61012 +               cons = intf->rsp_cons;
61013 +               prod = intf->rsp_prod;
61014 +               mb();
61015 +               if (!check_indexes(cons, prod)) {
61016 +                       intf->rsp_cons = intf->rsp_prod = 0;
61017 +                       return -EIO;
61018 +               }
61019 +
61020 +               src = get_input_chunk(cons, prod, intf->rsp, &avail);
61021 +               if (avail == 0)
61022 +                       continue;
61023 +               if (avail > len)
61024 +                       avail = len;
61025 +
61026 +               /* We must read header before we read data. */
61027 +               rmb();
61028 +
61029 +               memcpy(data, src, avail);
61030 +               data += avail;
61031 +               len -= avail;
61032 +
61033 +               /* Other side must not see free space until we've copied out */
61034 +               mb();
61035 +               intf->rsp_cons += avail;
61036 +
61037 +               pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
61038 +
61039 +               /* Implies mb(): they will see new header. */
61040 +               notify_remote_via_evtchn(xen_start_info->store_evtchn);
61041 +       }
61042 +
61043 +       return 0;
61044 +}
61045 +
61046 +/* Set up interrupt handler off store event channel. */
61047 +int xb_init_comms(void)
61048 +{
61049 +       int err;
61050 +
61051 +       if (xenbus_irq)
61052 +               unbind_from_irqhandler(xenbus_irq, &xb_waitq);
61053 +
61054 +       err = bind_evtchn_to_irqhandler(
61055 +               xen_start_info->store_evtchn, wake_waiting,
61056 +               0, "xenbus", &xb_waitq);
61057 +       if (err <= 0) {
61058 +               printk(KERN_ERR "XENBUS request irq failed %i\n", err);
61059 +               return err;
61060 +       }
61061 +
61062 +       xenbus_irq = err;
61063 +
61064 +       return 0;
61065 +}
61066 +
61067 +/*
61068 + * Local variables:
61069 + *  c-file-style: "linux"
61070 + *  indent-tabs-mode: t
61071 + *  c-indent-level: 8
61072 + *  c-basic-offset: 8
61073 + *  tab-width: 8
61074 + * End:
61075 + */
61076 diff -Nurp ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_comms.h tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_comms.h
61077 --- ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_comms.h        1970-01-01 01:00:00.000000000 +0100
61078 +++ tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_comms.h   2006-04-10 00:05:52.000000000 +0200
61079 @@ -0,0 +1,53 @@
61080 +/*
61081 + * Private include for xenbus communications.
61082 + * 
61083 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
61084 + *
61085 + * This program is free software; you can redistribute it and/or
61086 + * modify it under the terms of the GNU General Public License version 2
61087 + * as published by the Free Software Foundation; or, when distributed
61088 + * separately from the Linux kernel or incorporated into other
61089 + * software packages, subject to the following license:
61090 + * 
61091 + * Permission is hereby granted, free of charge, to any person obtaining a copy
61092 + * of this source file (the "Software"), to deal in the Software without
61093 + * restriction, including without limitation the rights to use, copy, modify,
61094 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
61095 + * and to permit persons to whom the Software is furnished to do so, subject to
61096 + * the following conditions:
61097 + * 
61098 + * The above copyright notice and this permission notice shall be included in
61099 + * all copies or substantial portions of the Software.
61100 + * 
61101 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
61102 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61103 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61104 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61105 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
61106 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
61107 + * IN THE SOFTWARE.
61108 + */
61109 +
61110 +#ifndef _XENBUS_COMMS_H
61111 +#define _XENBUS_COMMS_H
61112 +
61113 +int xs_init(void);
61114 +int xb_init_comms(void);
61115 +
61116 +/* Low level routines. */
61117 +int xb_write(const void *data, unsigned len);
61118 +int xb_read(void *data, unsigned len);
61119 +int xs_input_avail(void);
61120 +extern wait_queue_head_t xb_waitq;
61121 +
61122 +#endif /* _XENBUS_COMMS_H */
61123 +
61124 +/*
61125 + * Local variables:
61126 + *  c-file-style: "linux"
61127 + *  indent-tabs-mode: t
61128 + *  c-indent-level: 8
61129 + *  c-basic-offset: 8
61130 + *  tab-width: 8
61131 + * End:
61132 + */
61133 diff -Nurp ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_dev.c tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_dev.c
61134 --- ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_dev.c  1970-01-01 01:00:00.000000000 +0100
61135 +++ tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_dev.c     2006-04-10 00:05:52.000000000 +0200
61136 @@ -0,0 +1,252 @@
61137 +/*
61138 + * xenbus_dev.c
61139 + * 
61140 + * Driver giving user-space access to the kernel's xenbus connection
61141 + * to xenstore.
61142 + * 
61143 + * Copyright (c) 2005, Christian Limpach
61144 + * Copyright (c) 2005, Rusty Russell, IBM Corporation
61145 + * 
61146 + * This program is free software; you can redistribute it and/or
61147 + * modify it under the terms of the GNU General Public License version 2
61148 + * as published by the Free Software Foundation; or, when distributed
61149 + * separately from the Linux kernel or incorporated into other
61150 + * software packages, subject to the following license:
61151 + * 
61152 + * Permission is hereby granted, free of charge, to any person obtaining a copy
61153 + * of this source file (the "Software"), to deal in the Software without
61154 + * restriction, including without limitation the rights to use, copy, modify,
61155 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
61156 + * and to permit persons to whom the Software is furnished to do so, subject to
61157 + * the following conditions:
61158 + * 
61159 + * The above copyright notice and this permission notice shall be included in
61160 + * all copies or substantial portions of the Software.
61161 + * 
61162 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
61163 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61164 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61165 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61166 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
61167 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
61168 + * IN THE SOFTWARE.
61169 + */
61170 +
61171 +#include <linux/config.h>
61172 +#include <linux/kernel.h>
61173 +#include <linux/errno.h>
61174 +#include <linux/uio.h>
61175 +#include <linux/notifier.h>
61176 +#include <linux/wait.h>
61177 +#include <linux/fs.h>
61178 +#include <linux/poll.h>
61179 +
61180 +#include "xenbus_comms.h"
61181 +
61182 +#include <asm/uaccess.h>
61183 +#include <asm/hypervisor.h>
61184 +#include <xen/xenbus.h>
61185 +#include <xen/xen_proc.h>
61187 +
61188 +struct xenbus_dev_transaction {
61189 +       struct list_head list;
61190 +       xenbus_transaction_t handle;
61191 +};
61192 +
61193 +struct xenbus_dev_data {
61194 +       /* In-progress transaction. */
61195 +       struct list_head transactions;
61196 +
61197 +       /* Partial request. */
61198 +       unsigned int len;
61199 +       union {
61200 +               struct xsd_sockmsg msg;
61201 +               char buffer[PAGE_SIZE];
61202 +       } u;
61203 +
61204 +       /* Response queue. */
61205 +#define MASK_READ_IDX(idx) ((idx)&(PAGE_SIZE-1))
61206 +       char read_buffer[PAGE_SIZE];
61207 +       unsigned int read_cons, read_prod;
61208 +       wait_queue_head_t read_waitq;
61209 +};
61210 +
61211 +static struct proc_dir_entry *xenbus_dev_intf;
61212 +
61213 +static ssize_t xenbus_dev_read(struct file *filp,
61214 +                              char __user *ubuf,
61215 +                              size_t len, loff_t *ppos)
61216 +{
61217 +       struct xenbus_dev_data *u = filp->private_data;
61218 +       int i;
61219 +
61220 +       if (wait_event_interruptible(u->read_waitq,
61221 +                                    u->read_prod != u->read_cons))
61222 +               return -EINTR;
61223 +
61224 +       for (i = 0; i < len; i++) {
61225 +               if (u->read_cons == u->read_prod)
61226 +                       break;
61227 +               put_user(u->read_buffer[MASK_READ_IDX(u->read_cons)], ubuf+i);
61228 +               u->read_cons++;
61229 +       }
61230 +
61231 +       return i;
61232 +}
61233 +
61234 +static void queue_reply(struct xenbus_dev_data *u,
61235 +                       char *data, unsigned int len)
61236 +{
61237 +       int i;
61238 +
61239 +       for (i = 0; i < len; i++, u->read_prod++)
61240 +               u->read_buffer[MASK_READ_IDX(u->read_prod)] = data[i];
61241 +
61242 +       BUG_ON((u->read_prod - u->read_cons) > sizeof(u->read_buffer));
61243 +
61244 +       wake_up(&u->read_waitq);
61245 +}
61246 +
61247 +static ssize_t xenbus_dev_write(struct file *filp,
61248 +                               const char __user *ubuf,
61249 +                               size_t len, loff_t *ppos)
61250 +{
61251 +       struct xenbus_dev_data *u = filp->private_data;
61252 +       struct xenbus_dev_transaction *trans = NULL;
61253 +       void *reply;
61254 +
61255 +       if ((len + u->len) > sizeof(u->u.buffer))
61256 +               return -EINVAL;
61257 +
61258 +       if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0)
61259 +               return -EFAULT;
61260 +
61261 +       u->len += len;
61262 +       if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
61263 +               return len;
61264 +
61265 +       switch (u->u.msg.type) {
61266 +       case XS_TRANSACTION_START:
61267 +       case XS_TRANSACTION_END:
61268 +       case XS_DIRECTORY:
61269 +       case XS_READ:
61270 +       case XS_GET_PERMS:
61271 +       case XS_RELEASE:
61272 +       case XS_GET_DOMAIN_PATH:
61273 +       case XS_WRITE:
61274 +       case XS_MKDIR:
61275 +       case XS_RM:
61276 +       case XS_SET_PERMS:
61277 +               if (u->u.msg.type == XS_TRANSACTION_START) {
61278 +                       trans = kmalloc(sizeof(*trans), GFP_KERNEL);
61279 +                       if (!trans)
61280 +                               return -ENOMEM;
61281 +               }
61282 +
61283 +               reply = xenbus_dev_request_and_reply(&u->u.msg);
61284 +               if (IS_ERR(reply)) {
61285 +                       kfree(trans);
61286 +                       return PTR_ERR(reply);
61287 +               }
61288 +
61289 +               if (u->u.msg.type == XS_TRANSACTION_START) {
61290 +                       trans->handle = simple_strtoul(reply, NULL, 0);
61291 +                       list_add(&trans->list, &u->transactions);
61292 +               } else if (u->u.msg.type == XS_TRANSACTION_END) {
61293 +                       list_for_each_entry(trans, &u->transactions, list)
61294 +                               if (trans->handle == u->u.msg.tx_id)
61295 +                                       break;
61296 +                       BUG_ON(&trans->list == &u->transactions);
61297 +                       list_del(&trans->list);
61298 +                       kfree(trans);
61299 +               }
61300 +               queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
61301 +               queue_reply(u, (char *)reply, u->u.msg.len);
61302 +               kfree(reply);
61303 +               break;
61304 +
61305 +       default:
61306 +               return -EINVAL;
61307 +       }
61308 +
61309 +       u->len = 0;
61310 +       return len;
61311 +}
61312 +
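+/*
+ * Usage sketch (illustrative only; error handling omitted): a user-space
+ * client drives this device by writing one complete xsd_sockmsg plus
+ * payload, then reading back the echoed header followed by the reply
+ * body, e.g. for an XS_READ of "domid":
+ *
+ *   struct xsd_sockmsg m = { .type = XS_READ, .len = sizeof("domid") };
+ *   int fd = open("/proc/xen/xenbus", O_RDWR);
+ *   write(fd, &m, sizeof(m));
+ *   write(fd, "domid", sizeof("domid"));
+ *   read(fd, &m, sizeof(m));
+ *   read(fd, buf, m.len);
+ *
+ * The first read returns the reply header (m.len now gives the body
+ * size), the second the body. Partial writes are fine: requests are
+ * buffered in u->u.buffer until sizeof(msg) + msg.len bytes arrive.
+ */
+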
61313 +static int xenbus_dev_open(struct inode *inode, struct file *filp)
61314 +{
61315 +       struct xenbus_dev_data *u;
61316 +
61317 +       if (xen_start_info->store_evtchn == 0)
61318 +               return -ENOENT;
61319 +
61320 +       nonseekable_open(inode, filp);
61321 +
61322 +       u = kzalloc(sizeof(*u), GFP_KERNEL);
61323 +       if (u == NULL)
61324 +               return -ENOMEM;
61325 +
61326 +       INIT_LIST_HEAD(&u->transactions);
61327 +       init_waitqueue_head(&u->read_waitq);
61328 +
61329 +       filp->private_data = u;
61330 +
61331 +       return 0;
61332 +}
61333 +
61334 +static int xenbus_dev_release(struct inode *inode, struct file *filp)
61335 +{
61336 +       struct xenbus_dev_data *u = filp->private_data;
61337 +       struct xenbus_dev_transaction *trans, *tmp;
61338 +
61339 +       list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
61340 +               xenbus_transaction_end(trans->handle, 1);
61341 +               list_del(&trans->list);
61342 +               kfree(trans);
61343 +       }
61344 +
61345 +       kfree(u);
61346 +
61347 +       return 0;
61348 +}
61349 +
61350 +static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
61351 +{
61352 +       struct xenbus_dev_data *u = file->private_data;
61353 +
61354 +       poll_wait(file, &u->read_waitq, wait);
61355 +       if (u->read_cons != u->read_prod)
61356 +               return POLLIN | POLLRDNORM;
61357 +       return 0;
61358 +}
61359 +
61360 +static struct file_operations xenbus_dev_file_ops = {
61361 +       .read = xenbus_dev_read,
61362 +       .write = xenbus_dev_write,
61363 +       .open = xenbus_dev_open,
61364 +       .release = xenbus_dev_release,
61365 +       .poll = xenbus_dev_poll,
61366 +};
61367 +
61368 +static int __init
61369 +xenbus_dev_init(void)
61370 +{
61371 +       xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400);
61372 +       if (xenbus_dev_intf)
61373 +               xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops;
61374 +
61375 +       return 0;
61376 +}
61377 +
61378 +__initcall(xenbus_dev_init);
61379 +
61380 +/*
61381 + * Local variables:
61382 + *  c-file-style: "linux"
61383 + *  indent-tabs-mode: t
61384 + *  c-indent-level: 8
61385 + *  c-basic-offset: 8
61386 + *  tab-width: 8
61387 + * End:
61388 + */
61389 diff -Nurp ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_probe.c tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_probe.c
61390 --- ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_probe.c        1970-01-01 01:00:00.000000000 +0100
61391 +++ tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_probe.c   2006-04-10 00:05:52.000000000 +0200
61392 @@ -0,0 +1,1083 @@
61393 +/******************************************************************************
61394 + * Talks to Xen Store to figure out what devices we have.
61395 + *
61396 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
61397 + * Copyright (C) 2005 Mike Wray, Hewlett-Packard
61398 + * Copyright (C) 2005 XenSource Ltd
61399 + * 
61400 + * This program is free software; you can redistribute it and/or
61401 + * modify it under the terms of the GNU General Public License version 2
61402 + * as published by the Free Software Foundation; or, when distributed
61403 + * separately from the Linux kernel or incorporated into other
61404 + * software packages, subject to the following license:
61405 + * 
61406 + * Permission is hereby granted, free of charge, to any person obtaining a copy
61407 + * of this source file (the "Software"), to deal in the Software without
61408 + * restriction, including without limitation the rights to use, copy, modify,
61409 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
61410 + * and to permit persons to whom the Software is furnished to do so, subject to
61411 + * the following conditions:
61412 + * 
61413 + * The above copyright notice and this permission notice shall be included in
61414 + * all copies or substantial portions of the Software.
61415 + * 
61416 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
61417 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61418 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61419 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61420 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
61421 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
61422 + * IN THE SOFTWARE.
61423 + */
61424 +
61425 +#define DPRINTK(fmt, args...) \
61426 +    pr_debug("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
61427 +
61428 +#include <linux/kernel.h>
61429 +#include <linux/err.h>
61430 +#include <linux/string.h>
61431 +#include <linux/ctype.h>
61432 +#include <linux/fcntl.h>
61433 +#include <linux/mm.h>
61434 +#include <linux/notifier.h>
61435 +#include <linux/kthread.h>
61436 +
61437 +#include <asm/io.h>
61438 +#include <asm/page.h>
61439 +#include <asm/pgtable.h>
61440 +#include <asm/hypervisor.h>
61441 +#include <xen/xenbus.h>
61442 +#include <xen/xen_proc.h>
61443 +#include <xen/evtchn.h>
61444 +#include <xen/features.h>
61445 +
61446 +#include "xenbus_comms.h"
61447 +
61448 +extern struct mutex xenwatch_mutex;
61449 +
61450 +static struct notifier_block *xenstore_chain;
61451 +
61452 +/* If something in array of ids matches this device, return it. */
61453 +static const struct xenbus_device_id *
61454 +match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
61455 +{
61456 +       for (; *arr->devicetype != '\0'; arr++) {
61457 +               if (!strcmp(arr->devicetype, dev->devicetype))
61458 +                       return arr;
61459 +       }
61460 +       return NULL;
61461 +}
61462 +
61463 +static int xenbus_match(struct device *_dev, struct device_driver *_drv)
61464 +{
61465 +       struct xenbus_driver *drv = to_xenbus_driver(_drv);
61466 +
61467 +       if (!drv->ids)
61468 +               return 0;
61469 +
61470 +       return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
61471 +}
61472 +
61473 +struct xen_bus_type
61474 +{
61475 +       char *root;
61476 +       unsigned int levels;
61477 +       int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
61478 +       int (*probe)(const char *type, const char *dir);
61479 +       struct bus_type bus;
61480 +       struct device dev;
61481 +};
61482 +
61483 +
61484 +/* device/<type>/<id> => <type>-<id> */
61485 +static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
61486 +{
61487 +       nodename = strchr(nodename, '/');
61488 +       if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
61489 +               printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
61490 +               return -EINVAL;
61491 +       }
61492 +
61493 +       strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
61494 +       if (!strchr(bus_id, '/')) {
61495 +               printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
61496 +               return -EINVAL;
61497 +       }
61498 +       *strchr(bus_id, '/') = '-';
61499 +       return 0;
61500 +}
61501 +
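+/*
+ * Example: nodename "device/vbd/51712" yields bus_id "vbd-51712"; the
+ * leading "device/" is stripped and the remaining slash becomes a dash.
+ */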
61502 +
61503 +static void free_otherend_details(struct xenbus_device *dev)
61504 +{
61505 +       kfree(dev->otherend);
61506 +       dev->otherend = NULL;
61507 +}
61508 +
61509 +
61510 +static void free_otherend_watch(struct xenbus_device *dev)
61511 +{
61512 +       if (dev->otherend_watch.node) {
61513 +               unregister_xenbus_watch(&dev->otherend_watch);
61514 +               kfree(dev->otherend_watch.node);
61515 +               dev->otherend_watch.node = NULL;
61516 +       }
61517 +}
61518 +
61519 +
61520 +static int read_otherend_details(struct xenbus_device *xendev,
61521 +                                char *id_node, char *path_node)
61522 +{
61523 +       int err = xenbus_gather(XBT_NULL, xendev->nodename,
61524 +                               id_node, "%i", &xendev->otherend_id,
61525 +                               path_node, NULL, &xendev->otherend,
61526 +                               NULL);
61527 +       if (err) {
61528 +               xenbus_dev_fatal(xendev, err,
61529 +                                "reading other end details from %s",
61530 +                                xendev->nodename);
61531 +               return err;
61532 +       }
61533 +       if (strlen(xendev->otherend) == 0 ||
61534 +           !xenbus_exists(XBT_NULL, xendev->otherend, "")) {
61535 +               xenbus_dev_fatal(xendev, -ENOENT, "missing other end from %s",
61536 +                                xendev->nodename);
61537 +               free_otherend_details(xendev);
61538 +               return -ENOENT;
61539 +       }
61540 +
61541 +       return 0;
61542 +}
61543 +
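+/*
+ * Calling-convention note for xenbus_gather() (a sketch derived from its
+ * uses in this file): arguments come in (node, format, result) triples
+ * terminated by a single NULL, and a NULL format stores the raw value as
+ * a kmalloc'ed string which the caller must kfree(), e.g.:
+ *
+ *   char *fe;
+ *   int domid;
+ *   err = xenbus_gather(XBT_NULL, dir,
+ *                       "frontend-id", "%i", &domid,
+ *                       "frontend", NULL, &fe,
+ *                       NULL);
+ */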
61544 +
61545 +static int read_backend_details(struct xenbus_device *xendev)
61546 +{
61547 +       return read_otherend_details(xendev, "backend-id", "backend");
61548 +}
61549 +
61550 +
61551 +static int read_frontend_details(struct xenbus_device *xendev)
61552 +{
61553 +       return read_otherend_details(xendev, "frontend-id", "frontend");
61554 +}
61555 +
61556 +
61557 +/* Bus type for frontend drivers. */
61558 +static int xenbus_probe_frontend(const char *type, const char *name);
61559 +static struct xen_bus_type xenbus_frontend = {
61560 +       .root = "device",
61561 +       .levels = 2,            /* device/type/<id> */
61562 +       .get_bus_id = frontend_bus_id,
61563 +       .probe = xenbus_probe_frontend,
61564 +       .bus = {
61565 +               .name  = "xen",
61566 +               .match = xenbus_match,
61567 +       },
61568 +       .dev = {
61569 +               .bus_id = "xen",
61570 +       },
61571 +};
61572 +
61573 +/* backend/<type>/<fe-domid>/<id> => <type>-<fe-domid>-<id> */
61574 +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
61575 +{
61576 +       int domid, err;
61577 +       const char *devid, *type, *frontend;
61578 +       unsigned int typelen;
61579 +
61580 +       type = strchr(nodename, '/');
61581 +       if (!type)
61582 +               return -EINVAL;
61583 +       type++;
61584 +       typelen = strcspn(type, "/");
61585 +       if (!typelen || type[typelen] != '/')
61586 +               return -EINVAL;
61587 +
61588 +       devid = strrchr(nodename, '/') + 1;
61589 +
61590 +       err = xenbus_gather(XBT_NULL, nodename, "frontend-id", "%i", &domid,
61591 +                           "frontend", NULL, &frontend,
61592 +                           NULL);
61593 +       if (err)
61594 +               return err;
61595 +       if (strlen(frontend) == 0)
61596 +               err = -ERANGE;
61597 +       if (!err && !xenbus_exists(XBT_NULL, frontend, ""))
61598 +               err = -ENOENT;
61599 +
61600 +       kfree(frontend);
61601 +
61602 +       if (err)
61603 +               return err;
61604 +
61605 +       if (snprintf(bus_id, BUS_ID_SIZE,
61606 +                    "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
61607 +               return -ENOSPC;
61608 +       return 0;
61609 +}
61610 +
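+/*
+ * Worked example: for nodename "backend/vbd/3/51712", with the node's
+ * "frontend-id" key reading 3, the generated bus_id is "vbd-3-51712"
+ * (type "vbd", frontend domid 3, device id "51712").
+ */
+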
61611 +static int xenbus_uevent_backend(struct device *dev, char **envp,
61612 +                                int num_envp, char *buffer, int buffer_size);
61613 +static int xenbus_probe_backend(const char *type, const char *domid);
61614 +static struct xen_bus_type xenbus_backend = {
61615 +       .root = "backend",
61616 +       .levels = 3,            /* backend/type/<frontend>/<id> */
61617 +       .get_bus_id = backend_bus_id,
61618 +       .probe = xenbus_probe_backend,
61619 +       .bus = {
61620 +               .name  = "xen-backend",
61621 +               .match = xenbus_match,
61622 +               .uevent = xenbus_uevent_backend,
61623 +       },
61624 +       .dev = {
61625 +               .bus_id = "xen-backend",
61626 +       },
61627 +};
61628 +
61629 +static int xenbus_uevent_backend(struct device *dev, char **envp,
61630 +                                int num_envp, char *buffer, int buffer_size)
61631 +{
61632 +       struct xenbus_device *xdev;
61633 +       struct xenbus_driver *drv;
61634 +       int i = 0;
61635 +       int length = 0;
61636 +
61637 +       DPRINTK("");
61638 +
61639 +       if (dev == NULL)
61640 +               return -ENODEV;
61641 +
61642 +       xdev = to_xenbus_device(dev);
61643 +       if (xdev == NULL)
61644 +               return -ENODEV;
61645 +
61646 +       /* stuff we want to pass to /sbin/hotplug */
61647 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
61648 +                      "XENBUS_TYPE=%s", xdev->devicetype);
61649 +
61650 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
61651 +                      "XENBUS_PATH=%s", xdev->nodename);
61652 +
61653 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
61654 +                      "XENBUS_BASE_PATH=%s", xenbus_backend.root);
61655 +
61656 +       /* terminate, set to next free slot, shrink available space */
61657 +       envp[i] = NULL;
61658 +       envp = &envp[i];
61659 +       num_envp -= i;
61660 +       buffer = &buffer[length];
61661 +       buffer_size -= length;
61662 +
61663 +       if (dev->driver) {
61664 +               drv = to_xenbus_driver(dev->driver);
61665 +               if (drv && drv->uevent)
61666 +                       return drv->uevent(xdev, envp, num_envp, buffer,
61667 +                                          buffer_size);
61668 +       }
61669 +
61670 +       return 0;
61671 +}
61672 +
61673 +static void otherend_changed(struct xenbus_watch *watch,
61674 +                            const char **vec, unsigned int len)
61675 +{
61676 +       struct xenbus_device *dev =
61677 +               container_of(watch, struct xenbus_device, otherend_watch);
61678 +       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
61679 +       XenbusState state;
61680 +
61681 +       /* Protect us against watches firing on old details when the otherend
61682 +          details change, say immediately after a resume. */
61683 +       if (!dev->otherend ||
61684 +           strncmp(dev->otherend, vec[XS_WATCH_PATH],
61685 +                   strlen(dev->otherend))) {
61686 +               DPRINTK("Ignoring watch at %s", vec[XS_WATCH_PATH]);
61687 +               return;
61688 +       }
61689 +
61690 +       state = xenbus_read_driver_state(dev->otherend);
61691 +
61692 +       DPRINTK("state is %d, %s, %s",
61693 +               state, dev->otherend_watch.node, vec[XS_WATCH_PATH]);
61694 +       if (drv->otherend_changed)
61695 +               drv->otherend_changed(dev, state);
61696 +}
61697 +
61698 +
61699 +static int talk_to_otherend(struct xenbus_device *dev)
61700 +{
61701 +       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
61702 +
61703 +       free_otherend_watch(dev);
61704 +       free_otherend_details(dev);
61705 +
61706 +       return drv->read_otherend_details(dev);
61707 +}
61708 +
61709 +
61710 +static int watch_otherend(struct xenbus_device *dev)
61711 +{
61712 +       return xenbus_watch_path2(dev, dev->otherend, "state",
61713 +                                 &dev->otherend_watch, otherend_changed);
61714 +}
61715 +
61716 +
61717 +static int xenbus_dev_probe(struct device *_dev)
61718 +{
61719 +       struct xenbus_device *dev = to_xenbus_device(_dev);
61720 +       struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
61721 +       const struct xenbus_device_id *id;
61722 +       int err;
61723 +
61724 +       DPRINTK("");
61725 +
61726 +       if (!drv->probe) {
61727 +               err = -ENODEV;
61728 +               goto fail;
61729 +       }
61730 +
61731 +       id = match_device(drv->ids, dev);
61732 +       if (!id) {
61733 +               err = -ENODEV;
61734 +               goto fail;
61735 +       }
61736 +
61737 +       err = talk_to_otherend(dev);
61738 +       if (err) {
61739 +               printk(KERN_WARNING
61740 +                      "xenbus_probe: talk_to_otherend on %s failed.\n",
61741 +                      dev->nodename);
61742 +               return err;
61743 +       }
61744 +
61745 +       err = drv->probe(dev, id);
61746 +       if (err)
61747 +               goto fail;
61748 +
61749 +       err = watch_otherend(dev);
61750 +       if (err) {
61751 +               printk(KERN_WARNING
61752 +                      "xenbus_probe: watch_otherend on %s failed.\n",
61753 +                      dev->nodename);
61754 +               return err;
61755 +       }
61756 +
61757 +       return 0;
61758 +fail:
61759 +       xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
61760 +       xenbus_switch_state(dev, XenbusStateClosed);
61761 +       return -ENODEV;
61762 +}
61763 +
61764 +static int xenbus_dev_remove(struct device *_dev)
61765 +{
61766 +       struct xenbus_device *dev = to_xenbus_device(_dev);
61767 +       struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
61768 +
61769 +       DPRINTK("");
61770 +
61771 +       free_otherend_watch(dev);
61772 +       free_otherend_details(dev);
61773 +
61774 +       if (drv->remove)
61775 +               drv->remove(dev);
61776 +
61777 +       xenbus_switch_state(dev, XenbusStateClosed);
61778 +       return 0;
61779 +}
61780 +
61781 +static int xenbus_register_driver_common(struct xenbus_driver *drv,
61782 +                                        struct xen_bus_type *bus)
61783 +{
61784 +       int ret;
61785 +
61786 +       drv->driver.name = drv->name;
61787 +       drv->driver.bus = &bus->bus;
61788 +       drv->driver.owner = drv->owner;
61789 +       drv->driver.probe = xenbus_dev_probe;
61790 +       drv->driver.remove = xenbus_dev_remove;
61791 +
61792 +       mutex_lock(&xenwatch_mutex);
61793 +       ret = driver_register(&drv->driver);
61794 +       mutex_unlock(&xenwatch_mutex);
61795 +       return ret;
61796 +}
61797 +
61798 +int xenbus_register_frontend(struct xenbus_driver *drv)
61799 +{
61800 +       drv->read_otherend_details = read_backend_details;
61801 +
61802 +       return xenbus_register_driver_common(drv, &xenbus_frontend);
61803 +}
61804 +EXPORT_SYMBOL_GPL(xenbus_register_frontend);
61805 +
61806 +int xenbus_register_backend(struct xenbus_driver *drv)
61807 +{
61808 +       drv->read_otherend_details = read_frontend_details;
61809 +
61810 +       return xenbus_register_driver_common(drv, &xenbus_backend);
61811 +}
61812 +EXPORT_SYMBOL_GPL(xenbus_register_backend);
61813 +
61814 +void xenbus_unregister_driver(struct xenbus_driver *drv)
61815 +{
61816 +       driver_unregister(&drv->driver);
61817 +}
61818 +EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
61819 +
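+/*
+ * Registration sketch (names hypothetical): a minimal frontend driver
+ * needs little more than an id table and a probe callback:
+ *
+ *   static struct xenbus_device_id demofront_ids[] = {
+ *           { "demo" },
+ *           { "" },
+ *   };
+ *
+ *   static struct xenbus_driver demofront = {
+ *           .name  = "demofront",
+ *           .owner = THIS_MODULE,
+ *           .ids   = demofront_ids,
+ *           .probe = demofront_probe,
+ *   };
+ *
+ * xenbus_register_frontend(&demofront) then binds the driver to every
+ * device/demo/<id> node appearing in the store.
+ */
+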
61820 +struct xb_find_info
61821 +{
61822 +       struct xenbus_device *dev;
61823 +       const char *nodename;
61824 +};
61825 +
61826 +static int cmp_dev(struct device *dev, void *data)
61827 +{
61828 +       struct xenbus_device *xendev = to_xenbus_device(dev);
61829 +       struct xb_find_info *info = data;
61830 +
61831 +       if (!strcmp(xendev->nodename, info->nodename)) {
61832 +               info->dev = xendev;
61833 +               get_device(dev);
61834 +               return 1;
61835 +       }
61836 +       return 0;
61837 +}
61838 +
61839 +struct xenbus_device *xenbus_device_find(const char *nodename,
61840 +                                        struct bus_type *bus)
61841 +{
61842 +       struct xb_find_info info = { .dev = NULL, .nodename = nodename };
61843 +
61844 +       bus_for_each_dev(bus, NULL, &info, cmp_dev);
61845 +       return info.dev;
61846 +}
61847 +
61848 +static int cleanup_dev(struct device *dev, void *data)
61849 +{
61850 +       struct xenbus_device *xendev = to_xenbus_device(dev);
61851 +       struct xb_find_info *info = data;
61852 +       int len = strlen(info->nodename);
61853 +
61854 +       DPRINTK("%s", info->nodename);
61855 +
61856 +       /* Match the info->nodename path, or any subdirectory of that path. */
61857 +       if (strncmp(xendev->nodename, info->nodename, len))
61858 +               return 0;
61859 +
61860 +       /* If the node name is longer, ensure it really is a subdirectory. */
61861 +       if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
61862 +               return 0;
61863 +
61864 +       info->dev = xendev;
61865 +       get_device(dev);
61866 +       return 1;
61867 +}
61868 +
61869 +static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
61870 +{
61871 +       struct xb_find_info info = { .nodename = path };
61872 +
61873 +       do {
61874 +               info.dev = NULL;
61875 +               bus_for_each_dev(bus, NULL, &info, cleanup_dev);
61876 +               if (info.dev) {
61877 +                       device_unregister(&info.dev->dev);
61878 +                       put_device(&info.dev->dev);
61879 +               }
61880 +       } while (info.dev);
61881 +}
61882 +
61883 +static void xenbus_dev_release(struct device *dev)
61884 +{
61885 +       if (dev)
61886 +               kfree(to_xenbus_device(dev));
61887 +}
61888 +
61889 +/* Simplified asprintf. */
61890 +char *kasprintf(const char *fmt, ...)
61891 +{
61892 +       va_list ap;
61893 +       unsigned int len;
61894 +       char *p, dummy[1];
61895 +
61896 +       va_start(ap, fmt);
61897 +       /* FIXME: vsnprintf has a bug, NULL should work */
61898 +       len = vsnprintf(dummy, 0, fmt, ap);
61899 +       va_end(ap);
61900 +
61901 +       p = kmalloc(len + 1, GFP_KERNEL);
61902 +       if (!p)
61903 +               return NULL;
61904 +       va_start(ap, fmt);
61905 +       vsprintf(p, fmt, ap);
61906 +       va_end(ap);
61907 +       return p;
61908 +}
61909 +
61910 +static ssize_t xendev_show_nodename(struct device *dev,
61911 +                                   struct device_attribute *attr, char *buf)
61912 +{
61913 +       return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
61914 +}
61915 +DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
61916 +
61917 +static ssize_t xendev_show_devtype(struct device *dev,
61918 +                                  struct device_attribute *attr, char *buf)
61919 +{
61920 +       return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
61921 +}
61922 +DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
61923 +
61924 +
61925 +static int xenbus_probe_node(struct xen_bus_type *bus,
61926 +                            const char *type,
61927 +                            const char *nodename)
61928 +{
61929 +       int err;
61930 +       struct xenbus_device *xendev;
61931 +       size_t stringlen;
61932 +       char *tmpstring;
61933 +
61934 +       XenbusState state = xenbus_read_driver_state(nodename);
61935 +
61936 +       if (state != XenbusStateInitialising) {
61937 +               /* Device is not new, so ignore it.  This can happen if a
61938 +                  device is going away after switching to Closed.  */
61939 +               return 0;
61940 +       }
61941 +
61942 +       stringlen = strlen(nodename) + 1 + strlen(type) + 1;
61943 +       xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
61944 +       if (!xendev)
61945 +               return -ENOMEM;
61946 +
61947 +       /* Copy the strings into the extra space. */
61948 +
61949 +       tmpstring = (char *)(xendev + 1);
61950 +       strcpy(tmpstring, nodename);
61951 +       xendev->nodename = tmpstring;
61952 +
61953 +       tmpstring += strlen(tmpstring) + 1;
61954 +       strcpy(tmpstring, type);
61955 +       xendev->devicetype = tmpstring;
61956 +
61957 +       xendev->dev.parent = &bus->dev;
61958 +       xendev->dev.bus = &bus->bus;
61959 +       xendev->dev.release = xenbus_dev_release;
61960 +
61961 +       err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
61962 +       if (err)
61963 +               goto fail;
61964 +
61965 +       /* Register with generic device framework. */
61966 +       err = device_register(&xendev->dev);
61967 +       if (err)
61968 +               goto fail;
61969 +
61970 +       device_create_file(&xendev->dev, &dev_attr_nodename);
61971 +       device_create_file(&xendev->dev, &dev_attr_devtype);
61972 +
61973 +       return 0;
61974 +fail:
61975 +       kfree(xendev);
61976 +       return err;
61977 +}
61978 +
61979 +/* device/<typename>/<name> */
61980 +static int xenbus_probe_frontend(const char *type, const char *name)
61981 +{
61982 +       char *nodename;
61983 +       int err;
61984 +
61985 +       nodename = kasprintf("%s/%s/%s", xenbus_frontend.root, type, name);
61986 +       if (!nodename)
61987 +               return -ENOMEM;
61988 +
61989 +       DPRINTK("%s", nodename);
61990 +
61991 +       err = xenbus_probe_node(&xenbus_frontend, type, nodename);
61992 +       kfree(nodename);
61993 +       return err;
61994 +}
61995 +
61996 +/* backend/<typename>/<frontend-domid>/<name> */
61997 +static int xenbus_probe_backend_unit(const char *dir,
61998 +                                    const char *type,
61999 +                                    const char *name)
62000 +{
62001 +       char *nodename;
62002 +       int err;
62003 +
62004 +       nodename = kasprintf("%s/%s", dir, name);
62005 +       if (!nodename)
62006 +               return -ENOMEM;
62007 +
62008 +       DPRINTK("%s\n", nodename);
62009 +
62010 +       err = xenbus_probe_node(&xenbus_backend, type, nodename);
62011 +       kfree(nodename);
62012 +       return err;
62013 +}
62014 +
62015 +/* backend/<typename>/<frontend-domid> */
62016 +static int xenbus_probe_backend(const char *type, const char *domid)
62017 +{
62018 +       char *nodename;
62019 +       int err = 0;
62020 +       char **dir;
62021 +       unsigned int i, dir_n = 0;
62022 +
62023 +       DPRINTK("");
62024 +
62025 +       nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid);
62026 +       if (!nodename)
62027 +               return -ENOMEM;
62028 +
62029 +       dir = xenbus_directory(XBT_NULL, nodename, "", &dir_n);
62030 +       if (IS_ERR(dir)) {
62031 +               kfree(nodename);
62032 +               return PTR_ERR(dir);
62033 +       }
62034 +
62035 +       for (i = 0; i < dir_n; i++) {
62036 +               err = xenbus_probe_backend_unit(nodename, type, dir[i]);
62037 +               if (err)
62038 +                       break;
62039 +       }
62040 +       kfree(dir);
62041 +       kfree(nodename);
62042 +       return err;
62043 +}
62044 +
62045 +static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
62046 +{
62047 +       int err = 0;
62048 +       char **dir;
62049 +       unsigned int dir_n = 0;
62050 +       int i;
62051 +
62052 +       dir = xenbus_directory(XBT_NULL, bus->root, type, &dir_n);
62053 +       if (IS_ERR(dir))
62054 +               return PTR_ERR(dir);
62055 +
62056 +       for (i = 0; i < dir_n; i++) {
62057 +               err = bus->probe(type, dir[i]);
62058 +               if (err)
62059 +                       break;
62060 +       }
62061 +       kfree(dir);
62062 +       return err;
62063 +}
62064 +
62065 +static int xenbus_probe_devices(struct xen_bus_type *bus)
62066 +{
62067 +       int err = 0;
62068 +       char **dir;
62069 +       unsigned int i, dir_n;
62070 +
62071 +       dir = xenbus_directory(XBT_NULL, bus->root, "", &dir_n);
62072 +       if (IS_ERR(dir))
62073 +               return PTR_ERR(dir);
62074 +
62075 +       for (i = 0; i < dir_n; i++) {
62076 +               err = xenbus_probe_device_type(bus, dir[i]);
62077 +               if (err)
62078 +                       break;
62079 +       }
62080 +       kfree(dir);
62081 +       return err;
62082 +}
62083 +
62084 +static unsigned int char_count(const char *str, char c)
62085 +{
62086 +       unsigned int i, ret = 0;
62087 +
62088 +       for (i = 0; str[i]; i++)
62089 +               if (str[i] == c)
62090 +                       ret++;
62091 +       return ret;
62092 +}
62093 +
62094 +static int strsep_len(const char *str, char c, unsigned int len)
62095 +{
62096 +       unsigned int i;
62097 +
62098 +       for (i = 0; str[i]; i++)
62099 +               if (str[i] == c) {
62100 +                       if (len == 0)
62101 +                               return i;
62102 +                       len--;
62103 +               }
62104 +       return (len == 0) ? i : -ERANGE;
62105 +}
62106 +
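+/*
+ * Example: strsep_len("device/vif/0/state", '/', 2) returns 12, the
+ * offset just past the two-level root "device/vif/0"; strings with fewer
+ * than len separators yield -ERANGE. dev_changed() below uses this to
+ * recover a device's root path from an arbitrary watch path.
+ */
+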
62107 +static void dev_changed(const char *node, struct xen_bus_type *bus)
62108 +{
62109 +       int exists, rootlen;
62110 +       struct xenbus_device *dev;
62111 +       char type[BUS_ID_SIZE];
62112 +       const char *p, *root;
62113 +
62114 +       if (char_count(node, '/') < 2)
62115 +               return;
62116 +
62117 +       exists = xenbus_exists(XBT_NULL, node, "");
62118 +       if (!exists) {
62119 +               xenbus_cleanup_devices(node, &bus->bus);
62120 +               return;
62121 +       }
62122 +
62123 +       /* backend/<type>/... or device/<type>/... */
62124 +       p = strchr(node, '/') + 1;
62125 +       snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
62126 +       type[BUS_ID_SIZE-1] = '\0';
62127 +
62128 +       rootlen = strsep_len(node, '/', bus->levels);
62129 +       if (rootlen < 0)
62130 +               return;
62131 +       root = kasprintf("%.*s", rootlen, node);
62132 +       if (!root)
62133 +               return;
62134 +
62135 +       dev = xenbus_device_find(root, &bus->bus);
62136 +       if (!dev)
62137 +               xenbus_probe_node(bus, type, root);
62138 +       else
62139 +               put_device(&dev->dev);
62140 +
62141 +       kfree(root);
62142 +}
62143 +
62144 +static void frontend_changed(struct xenbus_watch *watch,
62145 +                            const char **vec, unsigned int len)
62146 +{
62147 +       DPRINTK("");
62148 +
62149 +       dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
62150 +}
62151 +
62152 +static void backend_changed(struct xenbus_watch *watch,
62153 +                           const char **vec, unsigned int len)
62154 +{
62155 +       DPRINTK("");
62156 +
62157 +       dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
62158 +}
62159 +
62160 +/* We watch for devices appearing and vanishing. */
62161 +static struct xenbus_watch fe_watch = {
62162 +       .node = "device",
62163 +       .callback = frontend_changed,
62164 +};
62165 +
62166 +static struct xenbus_watch be_watch = {
62167 +       .node = "backend",
62168 +       .callback = backend_changed,
62169 +};
62170 +
62171 +static int suspend_dev(struct device *dev, void *data)
62172 +{
62173 +       int err = 0;
62174 +       struct xenbus_driver *drv;
62175 +       struct xenbus_device *xdev;
62176 +
62177 +       DPRINTK("");
62178 +
62179 +       if (dev->driver == NULL)
62180 +               return 0;
62181 +       drv = to_xenbus_driver(dev->driver);
62182 +       xdev = container_of(dev, struct xenbus_device, dev);
62183 +       if (drv->suspend)
62184 +               err = drv->suspend(xdev);
62185 +       if (err)
62186 +               printk(KERN_WARNING
62187 +                      "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
62188 +       return 0;
62189 +}
62190 +
62191 +static int resume_dev(struct device *dev, void *data)
62192 +{
62193 +       int err;
62194 +       struct xenbus_driver *drv;
62195 +       struct xenbus_device *xdev;
62196 +
62197 +       DPRINTK("");
62198 +
62199 +       if (dev->driver == NULL)
62200 +               return 0;
62201 +       drv = to_xenbus_driver(dev->driver);
62202 +       xdev = container_of(dev, struct xenbus_device, dev);
62203 +
62204 +       err = talk_to_otherend(xdev);
62205 +       if (err) {
62206 +               printk(KERN_WARNING
62207 +                      "xenbus: resume (talk_to_otherend) %s failed: %i\n",
62208 +                      dev->bus_id, err);
62209 +               return err;
62210 +       }
62211 +
62212 +       err = watch_otherend(xdev);
62213 +       if (err) {
62214 +               printk(KERN_WARNING
62215 +                      "xenbus_probe: resume (watch_otherend) %s failed: "
62216 +                      "%d.\n", dev->bus_id, err);
62217 +               return err;
62218 +       }
62219 +
62220 +       if (drv->resume)
62221 +               err = drv->resume(xdev);
62222 +       if (err)
62223 +               printk(KERN_WARNING
62224 +                      "xenbus: resume %s failed: %i\n", dev->bus_id, err);
62225 +       return err;
62226 +}
62227 +
62228 +void xenbus_suspend(void)
62229 +{
62230 +       DPRINTK("");
62231 +
62232 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
62233 +       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev);
62234 +       xs_suspend();
62235 +}
62236 +EXPORT_SYMBOL_GPL(xenbus_suspend);
62237 +
62238 +void xenbus_resume(void)
62239 +{
62240 +       xb_init_comms();
62241 +       xs_resume();
62242 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
62243 +       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev);
62244 +}
62245 +EXPORT_SYMBOL_GPL(xenbus_resume);
62246 +
62247 +
62248 +/* A flag to determine if xenstored is 'ready' (i.e. has started) */
62249 +int xenstored_ready = 0;
62250 +
62251 +
62252 +int register_xenstore_notifier(struct notifier_block *nb)
62253 +{
62254 +       int ret = 0;
62255 +
62256 +       if (xenstored_ready > 0)
62257 +               ret = nb->notifier_call(nb, 0, NULL);
62258 +       else
62259 +               notifier_chain_register(&xenstore_chain, nb);
62260 +
62261 +       return ret;
62262 +}
62263 +EXPORT_SYMBOL_GPL(register_xenstore_notifier);
62264 +
62265 +void unregister_xenstore_notifier(struct notifier_block *nb)
62266 +{
62267 +       notifier_chain_unregister(&xenstore_chain, nb);
62268 +}
62269 +EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
62270 +
62271 +
62272 +static int all_devices_ready_(struct device *dev, void *data)
62273 +{
62274 +       struct xenbus_device *xendev = to_xenbus_device(dev);
62275 +       int *result = data;
62276 +
62277 +       if (xendev->state != XenbusStateConnected) {
62278 +               *result = 0;
62279 +               return 1;
62280 +       }
62281 +
62282 +       return 0;
62283 +}
62284 +
62285 +
62286 +static int all_devices_ready(void)
62287 +{
62288 +       int ready = 1;
62289 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, &ready,
62290 +                        all_devices_ready_);
62291 +       return ready;
62292 +}
62293 +
62294 +
62295 +void xenbus_probe(void *unused)
62296 +{
62297 +       int i;
62298 +
62299 +       BUG_ON(xenstored_ready <= 0);
62300 +
62301 +       /* Enumerate devices in xenstore. */
62302 +       xenbus_probe_devices(&xenbus_frontend);
62303 +       xenbus_probe_devices(&xenbus_backend);
62304 +
62305 +       /* Watch for changes. */
62306 +       register_xenbus_watch(&fe_watch);
62307 +       register_xenbus_watch(&be_watch);
62308 +
62309 +       /* Notify others that xenstore is up */
62310 +       notifier_call_chain(&xenstore_chain, 0, NULL);
62311 +
62312 +       /* Wait, with a 10 second timeout, for all currently configured
62313 +          devices.  We need to do this to guarantee that the filesystems
62314 +          and/or network devices needed for boot are available before we
62315 +          allow the boot to proceed.
62316 +
62317 +          A possible improvement here would be to have the tools add a
62318 +          per-device flag to the store entry, indicating whether it is needed
62319 +          at boot time.  This would allow people who knew what they were
62320 +          doing to accelerate their boot slightly, but of course needs tools
62321 +          or manual intervention to set up those flags correctly.
62322 +        */
62323 +       for (i = 0; i < 10 * HZ; i++) {
62324 +               if (all_devices_ready())
62325 +                       return;
62326 +
62327 +               set_current_state(TASK_INTERRUPTIBLE);
62328 +               schedule_timeout(1);
62329 +       }
62330 +
62331 +       printk(KERN_WARNING
62332 +              "XENBUS: Timeout connecting to devices!\n");
62333 +}
62334 +
62335 +
62336 +static struct file_operations xsd_kva_fops;
62337 +static struct proc_dir_entry *xsd_kva_intf;
62338 +static struct proc_dir_entry *xsd_port_intf;
62339 +
62340 +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
62341 +{
62342 +       size_t size = vma->vm_end - vma->vm_start;
62343 +
62344 +       if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
62345 +               return -EINVAL;
62346 +
62347 +       if (remap_pfn_range(vma, vma->vm_start,
62348 +                           mfn_to_pfn(xen_start_info->store_mfn),
62349 +                           size, vma->vm_page_prot))
62350 +               return -EAGAIN;
62351 +
62352 +       return 0;
62353 +}
62354 +
62355 +static int xsd_kva_read(char *page, char **start, off_t off,
62356 +                        int count, int *eof, void *data)
62357 +{
62358 +       int len;
62359 +
62360 +       len  = sprintf(page, "0x%p", mfn_to_virt(xen_start_info->store_mfn));
62361 +       *eof = 1;
62362 +       return len;
62363 +}
62364 +
62365 +static int xsd_port_read(char *page, char **start, off_t off,
62366 +                        int count, int *eof, void *data)
62367 +{
62368 +       int len;
62369 +
62370 +       len  = sprintf(page, "%d", xen_start_info->store_evtchn);
62371 +       *eof = 1;
62372 +       return len;
62373 +}
62374 +
62375 +
62376 +static int __init xenbus_probe_init(void)
62377 +{
62378 +       int err = 0, dom0;
62379 +       unsigned long page = 0;
62380 +
62381 +       DPRINTK("");
62382 +
62383 +       if (xen_init() < 0) {
62384 +               DPRINTK("failed");
62385 +               return -ENODEV;
62386 +       }
62387 +
62388 +       /* Register ourselves with the kernel bus subsystem */
62389 +       bus_register(&xenbus_frontend.bus);
62390 +       bus_register(&xenbus_backend.bus);
62391 +
62392 +       /*
62393 +        * Domain0 doesn't have a store_evtchn or store_mfn yet.
62394 +        */
62395 +       dom0 = (xen_start_info->store_evtchn == 0);
62396 +
62397 +       if (dom0) {
62398 +               evtchn_op_t op = { 0 };
62399 +
62400 +               /* Allocate page. */
62401 +               page = get_zeroed_page(GFP_KERNEL);
62402 +               if (!page)
62403 +                       return -ENOMEM;
62404 +
62405 +               xen_start_info->store_mfn =
62406 +                       pfn_to_mfn(virt_to_phys((void *)page) >>
62407 +                                  PAGE_SHIFT);
62408 +
62409 +               /* Next allocate a local port which xenstored can bind to */
62410 +               op.cmd = EVTCHNOP_alloc_unbound;
62411 +               op.u.alloc_unbound.dom        = DOMID_SELF;
62412 +               op.u.alloc_unbound.remote_dom = 0;
62413 +
62414 +               err = HYPERVISOR_event_channel_op(&op);
62415 +               if (err == -ENOSYS)
62416 +                       goto err;
62417 +               BUG_ON(err);
62418 +               xen_start_info->store_evtchn = op.u.alloc_unbound.port;
62419 +
62420 +               /* And finally publish the above info in /proc/xen */
62421 +               xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
62422 +               if (xsd_kva_intf) {
62423 +                       memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
62424 +                              sizeof(xsd_kva_fops));
62425 +                       xsd_kva_fops.mmap = xsd_kva_mmap;
62426 +                       xsd_kva_intf->proc_fops = &xsd_kva_fops;
62427 +                       xsd_kva_intf->read_proc = xsd_kva_read;
62428 +               }
62429 +               xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
62430 +               if (xsd_port_intf)
62431 +                       xsd_port_intf->read_proc = xsd_port_read;
62432 +       } else
62433 +               xenstored_ready = 1;
62434 +
62435 +       /* Initialize the interface to xenstore. */
62436 +       err = xs_init();
62437 +       if (err) {
62438 +               printk(KERN_WARNING
62439 +                      "XENBUS: Error initializing xenstore comms: %i\n", err);
62440 +               goto err;
62441 +       }
62442 +
62443 +       /* Register ourselves with the kernel device subsystem */
62444 +       device_register(&xenbus_frontend.dev);
62445 +       device_register(&xenbus_backend.dev);
62446 +
62447 +       if (!dom0)
62448 +               xenbus_probe(NULL);
62449 +
62450 +       return 0;
62451 +
62452 + err:
62453 +       if (page)
62454 +               free_page(page);
62455 +
62456 +       /*
62457 +        * Do not unregister the xenbus front/backend buses here. The
62458 +        * buses must exist because front/backend drivers will use
62459 +        * them when they are registered.
62460 +        */
62461 +
62462 +       return err;
62463 +}
62464 +
62465 +postcore_initcall(xenbus_probe_init);
62466 +
62467 +/*
62468 + * Local variables:
62469 + *  c-file-style: "linux"
62470 + *  indent-tabs-mode: t
62471 + *  c-indent-level: 8
62472 + *  c-basic-offset: 8
62473 + *  tab-width: 8
62474 + * End:
62475 + */
62476 diff -Nurp ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_xs.c tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_xs.c
62477 --- ref-linux-2.6.16.9/drivers/xen/xenbus/xenbus_xs.c   1970-01-01 01:00:00.000000000 +0100
62478 +++ tmp-linux-2.6-xen.patch/drivers/xen/xenbus/xenbus_xs.c      2006-04-10 00:05:52.000000000 +0200
62479 @@ -0,0 +1,856 @@
62480 +/******************************************************************************
62481 + * xenbus_xs.c
62482 + *
62483 + * This is the kernel equivalent of the "xs" library.  We don't need everything
62484 + * and we use xenbus_comms for communication.
62485 + *
62486 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
62487 + * 
62488 + * This program is free software; you can redistribute it and/or
62489 + * modify it under the terms of the GNU General Public License version 2
62490 + * as published by the Free Software Foundation; or, when distributed
62491 + * separately from the Linux kernel or incorporated into other
62492 + * software packages, subject to the following license:
62493 + * 
62494 + * Permission is hereby granted, free of charge, to any person obtaining a copy
62495 + * of this source file (the "Software"), to deal in the Software without
62496 + * restriction, including without limitation the rights to use, copy, modify,
62497 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62498 + * and to permit persons to whom the Software is furnished to do so, subject to
62499 + * the following conditions:
62500 + * 
62501 + * The above copyright notice and this permission notice shall be included in
62502 + * all copies or substantial portions of the Software.
62503 + * 
62504 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62505 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62506 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62507 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62508 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62509 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62510 + * IN THE SOFTWARE.
62511 + */
62512 +
62513 +#include <linux/unistd.h>
62514 +#include <linux/errno.h>
62515 +#include <linux/types.h>
62516 +#include <linux/uio.h>
62517 +#include <linux/kernel.h>
62518 +#include <linux/string.h>
62519 +#include <linux/err.h>
62520 +#include <linux/slab.h>
62521 +#include <linux/fcntl.h>
62522 +#include <linux/kthread.h>
62523 +#include <linux/rwsem.h>
62524 +#include <xen/xenbus.h>
62525 +#include "xenbus_comms.h"
62526 +
62527 +/* xenbus_probe.c */
62528 +extern char *kasprintf(const char *fmt, ...);
62529 +
62530 +struct xs_stored_msg {
62531 +       struct list_head list;
62532 +
62533 +       struct xsd_sockmsg hdr;
62534 +
62535 +       union {
62536 +               /* Queued replies. */
62537 +               struct {
62538 +                       char *body;
62539 +               } reply;
62540 +
62541 +               /* Queued watch events. */
62542 +               struct {
62543 +                       struct xenbus_watch *handle;
62544 +                       char **vec;
62545 +                       unsigned int vec_size;
62546 +               } watch;
62547 +       } u;
62548 +};
62549 +
62550 +struct xs_handle {
62551 +       /* A list of replies. Currently only one will ever be outstanding. */
62552 +       struct list_head reply_list;
62553 +       spinlock_t reply_lock;
62554 +       wait_queue_head_t reply_waitq;
62555 +
62556 +       /* One request at a time. */
62557 +       struct mutex request_mutex;
62558 +
62559 +       /* Protect transactions against save/restore. */
62560 +       struct rw_semaphore suspend_mutex;
62561 +};
62562 +
62563 +static struct xs_handle xs_state;
62564 +
62565 +/* List of registered watches, and a lock to protect it. */
62566 +static LIST_HEAD(watches);
62567 +static DEFINE_SPINLOCK(watches_lock);
62568 +
62569 +/* List of pending watch callback events, and a lock to protect it. */
62570 +static LIST_HEAD(watch_events);
62571 +static DEFINE_SPINLOCK(watch_events_lock);
62572 +
62573 +/*
62574 + * Details of the xenwatch callback kernel thread. The thread waits on the
62575 + * watch_events_waitq for work to do (queued on watch_events list). When it
62576 + * wakes up it acquires the xenwatch_mutex before reading the list and
62577 + * carrying out work.
62578 + */
62579 +static pid_t xenwatch_pid;
62580 +/* static */ DEFINE_MUTEX(xenwatch_mutex);
62581 +static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
62582 +
62583 +static int get_error(const char *errorstring)
62584 +{
62585 +       unsigned int i;
62586 +
62587 +       for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
62588 +               if (i == ARRAY_SIZE(xsd_errors) - 1) {
62589 +                       printk(KERN_WARNING
62590 +                              "XENBUS xen store gave: unknown error %s\n",
62591 +                              errorstring);
62592 +                       return EINVAL;
62593 +               }
62594 +       }
62595 +       return xsd_errors[i].errnum;
62596 +}
62597 +
62598 +static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
62599 +{
62600 +       struct xs_stored_msg *msg;
62601 +       char *body;
62602 +
62603 +       spin_lock(&xs_state.reply_lock);
62604 +
62605 +       while (list_empty(&xs_state.reply_list)) {
62606 +               spin_unlock(&xs_state.reply_lock);
62607 +               /* XXX FIXME: Avoid synchronous wait for response here. */
62608 +               wait_event(xs_state.reply_waitq,
62609 +                          !list_empty(&xs_state.reply_list));
62610 +               spin_lock(&xs_state.reply_lock);
62611 +       }
62612 +
62613 +       msg = list_entry(xs_state.reply_list.next,
62614 +                        struct xs_stored_msg, list);
62615 +       list_del(&msg->list);
62616 +
62617 +       spin_unlock(&xs_state.reply_lock);
62618 +
62619 +       *type = msg->hdr.type;
62620 +       if (len)
62621 +               *len = msg->hdr.len;
62622 +       body = msg->u.reply.body;
62623 +
62624 +       kfree(msg);
62625 +
62626 +       return body;
62627 +}
62628 +
62629 +/* Emergency write. */
62630 +void xenbus_debug_write(const char *str, unsigned int count)
62631 +{
62632 +       struct xsd_sockmsg msg = { 0 };
62633 +
62634 +       msg.type = XS_DEBUG;
62635 +       msg.len = sizeof("print") + count + 1;
62636 +
62637 +       mutex_lock(&xs_state.request_mutex);
62638 +       xb_write(&msg, sizeof(msg));
62639 +       xb_write("print", sizeof("print"));
62640 +       xb_write(str, count);
62641 +       xb_write("", 1);
62642 +       mutex_unlock(&xs_state.request_mutex);
62643 +}
62644 +
62645 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
62646 +{
62647 +       void *ret;
62648 +       struct xsd_sockmsg req_msg = *msg;
62649 +       int err;
62650 +
62651 +       if (req_msg.type == XS_TRANSACTION_START)
62652 +               down_read(&xs_state.suspend_mutex);
62653 +
62654 +       mutex_lock(&xs_state.request_mutex);
62655 +
62656 +       err = xb_write(msg, sizeof(*msg) + msg->len);
62657 +       if (err) {
62658 +               msg->type = XS_ERROR;
62659 +               ret = ERR_PTR(err);
62660 +       } else
62661 +               ret = read_reply(&msg->type, &msg->len);
62662 +
62663 +       mutex_unlock(&xs_state.request_mutex);
62664 +
62665 +       if ((msg->type == XS_TRANSACTION_END) ||
62666 +           ((req_msg.type == XS_TRANSACTION_START) &&
62667 +            (msg->type == XS_ERROR)))
62668 +               up_read(&xs_state.suspend_mutex);
62669 +
62670 +       return ret;
62671 +}
62672 +
62673 +/* Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error. */
62674 +static void *xs_talkv(xenbus_transaction_t t,
62675 +                     enum xsd_sockmsg_type type,
62676 +                     const struct kvec *iovec,
62677 +                     unsigned int num_vecs,
62678 +                     unsigned int *len)
62679 +{
62680 +       struct xsd_sockmsg msg;
62681 +       void *ret = NULL;
62682 +       unsigned int i;
62683 +       int err;
62684 +
62685 +       msg.tx_id = t;
62686 +       msg.req_id = 0;
62687 +       msg.type = type;
62688 +       msg.len = 0;
62689 +       for (i = 0; i < num_vecs; i++)
62690 +               msg.len += iovec[i].iov_len;
62691 +
62692 +       mutex_lock(&xs_state.request_mutex);
62693 +
62694 +       err = xb_write(&msg, sizeof(msg));
62695 +       if (err) {
62696 +               mutex_unlock(&xs_state.request_mutex);
62697 +               return ERR_PTR(err);
62698 +       }
62699 +
62700 +       for (i = 0; i < num_vecs; i++) {
62701 +               err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
62702 +               if (err) {
62703 +                       mutex_unlock(&xs_state.request_mutex);
62704 +                       return ERR_PTR(err);
62705 +               }
62706 +       }
62707 +
62708 +       ret = read_reply(&msg.type, len);
62709 +
62710 +       mutex_unlock(&xs_state.request_mutex);
62711 +
62712 +       if (IS_ERR(ret))
62713 +               return ret;
62714 +
62715 +       if (msg.type == XS_ERROR) {
62716 +               err = get_error(ret);
62717 +               kfree(ret);
62718 +               return ERR_PTR(-err);
62719 +       }
62720 +
62721 +       if (msg.type != type) {
62722 +               if (printk_ratelimit())
62723 +                       printk(KERN_WARNING
62724 +                              "XENBUS unexpected type [%d], expected [%d]\n",
62725 +                              msg.type, type);
62726 +               kfree(ret);
62727 +               return ERR_PTR(-EINVAL);
62728 +       }
62729 +       return ret;
62730 +}
62731 +
62732 +/* Simplified version of xs_talkv: single message. */
62733 +static void *xs_single(xenbus_transaction_t t,
62734 +                      enum xsd_sockmsg_type type,
62735 +                      const char *string,
62736 +                      unsigned int *len)
62737 +{
62738 +       struct kvec iovec;
62739 +
62740 +       iovec.iov_base = (void *)string;
62741 +       iovec.iov_len = strlen(string) + 1;
62742 +       return xs_talkv(t, type, &iovec, 1, len);
62743 +}
62744 +
62745 +/* Many commands need only an ack; we don't care what the reply says. */
62746 +static int xs_error(char *reply)
62747 +{
62748 +       if (IS_ERR(reply))
62749 +               return PTR_ERR(reply);
62750 +       kfree(reply);
62751 +       return 0;
62752 +}
62753 +
62754 +static unsigned int count_strings(const char *strings, unsigned int len)
62755 +{
62756 +       unsigned int num;
62757 +       const char *p;
62758 +
62759 +       for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
62760 +               num++;
62761 +
62762 +       return num;
62763 +}
62764 +
62765 +/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
62766 +static char *join(const char *dir, const char *name)
62767 +{
62768 +       char *buffer;
62769 +
62770 +       if (strlen(name) == 0)
62771 +               buffer = kasprintf("%s", dir);
62772 +       else
62773 +               buffer = kasprintf("%s/%s", dir, name);
62774 +       return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
62775 +}
62776 +
62777 +static char **split(char *strings, unsigned int len, unsigned int *num)
62778 +{
62779 +       char *p, **ret;
62780 +
62781 +       /* Count the strings. */
62782 +       *num = count_strings(strings, len);
62783 +
62784 +       /* Transfer to one big alloc for easy freeing. */
62785 +       ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
62786 +       if (!ret) {
62787 +               kfree(strings);
62788 +               return ERR_PTR(-ENOMEM);
62789 +       }
62790 +       memcpy(&ret[*num], strings, len);
62791 +       kfree(strings);
62792 +
62793 +       strings = (char *)&ret[*num];
62794 +       for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
62795 +               ret[(*num)++] = p;
62796 +
62797 +       return ret;
62798 +}
62799 +
62800 +char **xenbus_directory(xenbus_transaction_t t,
62801 +                       const char *dir, const char *node, unsigned int *num)
62802 +{
62803 +       char *strings, *path;
62804 +       unsigned int len;
62805 +
62806 +       path = join(dir, node);
62807 +       if (IS_ERR(path))
62808 +               return (char **)path;
62809 +
62810 +       strings = xs_single(t, XS_DIRECTORY, path, &len);
62811 +       kfree(path);
62812 +       if (IS_ERR(strings))
62813 +               return (char **)strings;
62814 +
62815 +       return split(strings, len, num);
62816 +}
62817 +EXPORT_SYMBOL_GPL(xenbus_directory);
62818 +
62819 +/* Check if a path exists. Return 1 if it does. */
62820 +int xenbus_exists(xenbus_transaction_t t,
62821 +                 const char *dir, const char *node)
62822 +{
62823 +       char **d;
62824 +       unsigned int dir_n;
62825 +
62826 +       d = xenbus_directory(t, dir, node, &dir_n);
62827 +       if (IS_ERR(d))
62828 +               return 0;
62829 +       kfree(d);
62830 +       return 1;
62831 +}
62832 +EXPORT_SYMBOL_GPL(xenbus_exists);
62833 +
62834 +/* Get the value of a single file.
62835 + * Returns a kmalloced value: call kfree() on it after use.
62836 + * len indicates length in bytes.
62837 + */
62838 +void *xenbus_read(xenbus_transaction_t t,
62839 +                 const char *dir, const char *node, unsigned int *len)
62840 +{
62841 +       char *path;
62842 +       void *ret;
62843 +
62844 +       path = join(dir, node);
62845 +       if (IS_ERR(path))
62846 +               return (void *)path;
62847 +
62848 +       ret = xs_single(t, XS_READ, path, len);
62849 +       kfree(path);
62850 +       return ret;
62851 +}
62852 +EXPORT_SYMBOL_GPL(xenbus_read);
62853 +
62854 +/* Write the value of a single file.
62855 + * Returns -err on failure.
62856 + */
62857 +int xenbus_write(xenbus_transaction_t t,
62858 +                const char *dir, const char *node, const char *string)
62859 +{
62860 +       const char *path;
62861 +       struct kvec iovec[2];
62862 +       int ret;
62863 +
62864 +       path = join(dir, node);
62865 +       if (IS_ERR(path))
62866 +               return PTR_ERR(path);
62867 +
62868 +       iovec[0].iov_base = (void *)path;
62869 +       iovec[0].iov_len = strlen(path) + 1;
62870 +       iovec[1].iov_base = (void *)string;
62871 +       iovec[1].iov_len = strlen(string);
62872 +
62873 +       ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
62874 +       kfree(path);
62875 +       return ret;
62876 +}
62877 +EXPORT_SYMBOL_GPL(xenbus_write);
62878 +
62879 +/* Create a new directory. */
62880 +int xenbus_mkdir(xenbus_transaction_t t,
62881 +                const char *dir, const char *node)
62882 +{
62883 +       char *path;
62884 +       int ret;
62885 +
62886 +       path = join(dir, node);
62887 +       if (IS_ERR(path))
62888 +               return PTR_ERR(path);
62889 +
62890 +       ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
62891 +       kfree(path);
62892 +       return ret;
62893 +}
62894 +EXPORT_SYMBOL_GPL(xenbus_mkdir);
62895 +
62896 +/* Destroy a file or directory (directories must be empty). */
62897 +int xenbus_rm(xenbus_transaction_t t, const char *dir, const char *node)
62898 +{
62899 +       char *path;
62900 +       int ret;
62901 +
62902 +       path = join(dir, node);
62903 +       if (IS_ERR(path))
62904 +               return PTR_ERR(path);
62905 +
62906 +       ret = xs_error(xs_single(t, XS_RM, path, NULL));
62907 +       kfree(path);
62908 +       return ret;
62909 +}
62910 +EXPORT_SYMBOL_GPL(xenbus_rm);
62911 +
62912 +/* Start a transaction: changes by others will not be seen during this
62913 + * transaction, and changes will not be visible to others until end.
62914 + */
62915 +int xenbus_transaction_start(xenbus_transaction_t *t)
62916 +{
62917 +       char *id_str;
62918 +
62919 +       down_read(&xs_state.suspend_mutex);
62920 +
62921 +       id_str = xs_single(XBT_NULL, XS_TRANSACTION_START, "", NULL);
62922 +       if (IS_ERR(id_str)) {
62923 +               up_read(&xs_state.suspend_mutex);
62924 +               return PTR_ERR(id_str);
62925 +       }
62926 +
62927 +       *t = simple_strtoul(id_str, NULL, 0);
62928 +       kfree(id_str);
62929 +       return 0;
62930 +}
62931 +EXPORT_SYMBOL_GPL(xenbus_transaction_start);
62932 +
62933 +/* End a transaction.
62934 + * If abort is true, the transaction is discarded instead of committed.
62935 + */
62936 +int xenbus_transaction_end(xenbus_transaction_t t, int abort)
62937 +{
62938 +       char abortstr[2];
62939 +       int err;
62940 +
62941 +       if (abort)
62942 +               strcpy(abortstr, "F");
62943 +       else
62944 +               strcpy(abortstr, "T");
62945 +
62946 +       err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
62947 +
62948 +       up_read(&xs_state.suspend_mutex);
62949 +
62950 +       return err;
62951 +}
62952 +EXPORT_SYMBOL_GPL(xenbus_transaction_end);
62953 +
62954 +/* Single read and scanf: returns -errno or num scanned. */
62955 +int xenbus_scanf(xenbus_transaction_t t,
62956 +                const char *dir, const char *node, const char *fmt, ...)
62957 +{
62958 +       va_list ap;
62959 +       int ret;
62960 +       char *val;
62961 +
62962 +       val = xenbus_read(t, dir, node, NULL);
62963 +       if (IS_ERR(val))
62964 +               return PTR_ERR(val);
62965 +
62966 +       va_start(ap, fmt);
62967 +       ret = vsscanf(val, fmt, ap);
62968 +       va_end(ap);
62969 +       kfree(val);
62970 +       /* Distinctive errno. */
62971 +       if (ret == 0)
62972 +               return -ERANGE;
62973 +       return ret;
62974 +}
62975 +EXPORT_SYMBOL_GPL(xenbus_scanf);
62976 +
62977 +/* Single printf and write: returns -errno or 0. */
62978 +int xenbus_printf(xenbus_transaction_t t,
62979 +                 const char *dir, const char *node, const char *fmt, ...)
62980 +{
62981 +       va_list ap;
62982 +       int ret;
62983 +#define PRINTF_BUFFER_SIZE 4096
62984 +       char *printf_buffer;
62985 +
62986 +       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
62987 +       if (printf_buffer == NULL)
62988 +               return -ENOMEM;
62989 +
62990 +       va_start(ap, fmt);
62991 +       ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
62992 +       va_end(ap);
62993 +
62994 +       BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
62995 +       ret = xenbus_write(t, dir, node, printf_buffer);
62996 +
62997 +       kfree(printf_buffer);
62998 +
62999 +       return ret;
63000 +}
63001 +EXPORT_SYMBOL_GPL(xenbus_printf);
63002 +
63003 +/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
63004 +int xenbus_gather(xenbus_transaction_t t, const char *dir, ...)
63005 +{
63006 +       va_list ap;
63007 +       const char *name;
63008 +       int ret = 0;
63009 +
63010 +       va_start(ap, dir);
63011 +       while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
63012 +               const char *fmt = va_arg(ap, char *);
63013 +               void *result = va_arg(ap, void *);
63014 +               char *p;
63015 +
63016 +               p = xenbus_read(t, dir, name, NULL);
63017 +               if (IS_ERR(p)) {
63018 +                       ret = PTR_ERR(p);
63019 +                       break;
63020 +               }
63021 +               if (fmt) {
63022 +                       if (sscanf(p, fmt, result) == 0)
63023 +                               ret = -EINVAL;
63024 +                       kfree(p);
63025 +               } else
63026 +                       *(char **)result = p;
63027 +       }
63028 +       va_end(ap);
63029 +       return ret;
63030 +}
63031 +EXPORT_SYMBOL_GPL(xenbus_gather);
63032 +
63033 +static int xs_watch(const char *path, const char *token)
63034 +{
63035 +       struct kvec iov[2];
63036 +
63037 +       iov[0].iov_base = (void *)path;
63038 +       iov[0].iov_len = strlen(path) + 1;
63039 +       iov[1].iov_base = (void *)token;
63040 +       iov[1].iov_len = strlen(token) + 1;
63041 +
63042 +       return xs_error(xs_talkv(XBT_NULL, XS_WATCH, iov,
63043 +                                ARRAY_SIZE(iov), NULL));
63044 +}
63045 +
63046 +static int xs_unwatch(const char *path, const char *token)
63047 +{
63048 +       struct kvec iov[2];
63049 +
63050 +       iov[0].iov_base = (char *)path;
63051 +       iov[0].iov_len = strlen(path) + 1;
63052 +       iov[1].iov_base = (char *)token;
63053 +       iov[1].iov_len = strlen(token) + 1;
63054 +
63055 +       return xs_error(xs_talkv(XBT_NULL, XS_UNWATCH, iov,
63056 +                                ARRAY_SIZE(iov), NULL));
63057 +}
63058 +
63059 +static struct xenbus_watch *find_watch(const char *token)
63060 +{
63061 +       struct xenbus_watch *i, *cmp;
63062 +
63063 +       cmp = (void *)simple_strtoul(token, NULL, 16);
63064 +
63065 +       list_for_each_entry(i, &watches, list)
63066 +               if (i == cmp)
63067 +                       return i;
63068 +
63069 +       return NULL;
63070 +}
63071 +
63072 +/* Register callback to watch this node. */
63073 +int register_xenbus_watch(struct xenbus_watch *watch)
63074 +{
63075 +       /* The pointer, printed in ASCII, is the token. */
63076 +       char token[sizeof(watch) * 2 + 1];
63077 +       int err;
63078 +
63079 +       sprintf(token, "%lX", (long)watch);
63080 +
63081 +       down_read(&xs_state.suspend_mutex);
63082 +
63083 +       spin_lock(&watches_lock);
63084 +       BUG_ON(find_watch(token));
63085 +       list_add(&watch->list, &watches);
63086 +       spin_unlock(&watches_lock);
63087 +
63088 +       err = xs_watch(watch->node, token);
63089 +
63090 +       /* Ignore errors due to multiple registration. */
63091 +       if ((err != 0) && (err != -EEXIST)) {
63092 +               spin_lock(&watches_lock);
63093 +               list_del(&watch->list);
63094 +               spin_unlock(&watches_lock);
63095 +       }
63096 +
63097 +       up_read(&xs_state.suspend_mutex);
63098 +
63099 +       return err;
63100 +}
63101 +EXPORT_SYMBOL_GPL(register_xenbus_watch);
63102 +
63103 +void unregister_xenbus_watch(struct xenbus_watch *watch)
63104 +{
63105 +       struct xs_stored_msg *msg, *tmp;
63106 +       char token[sizeof(watch) * 2 + 1];
63107 +       int err;
63108 +
63109 +       sprintf(token, "%lX", (long)watch);
63110 +
63111 +       down_read(&xs_state.suspend_mutex);
63112 +
63113 +       spin_lock(&watches_lock);
63114 +       BUG_ON(!find_watch(token));
63115 +       list_del(&watch->list);
63116 +       spin_unlock(&watches_lock);
63117 +
63118 +       err = xs_unwatch(watch->node, token);
63119 +       if (err)
63120 +               printk(KERN_WARNING
63121 +                      "XENBUS Failed to release watch %s: %i\n",
63122 +                      watch->node, err);
63123 +
63124 +       up_read(&xs_state.suspend_mutex);
63125 +
63126 +       /* Cancel pending watch events. */
63127 +       spin_lock(&watch_events_lock);
63128 +       list_for_each_entry_safe(msg, tmp, &watch_events, list) {
63129 +               if (msg->u.watch.handle != watch)
63130 +                       continue;
63131 +               list_del(&msg->list);
63132 +               kfree(msg->u.watch.vec);
63133 +               kfree(msg);
63134 +       }
63135 +       spin_unlock(&watch_events_lock);
63136 +
63137 +       /* Flush any currently-executing callback, unless we are it. :-) */
63138 +       if (current->pid != xenwatch_pid) {
63139 +               mutex_lock(&xenwatch_mutex);
63140 +               mutex_unlock(&xenwatch_mutex);
63141 +       }
63142 +}
63143 +EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
63144 +
63145 +void xs_suspend(void)
63146 +{
63147 +       down_write(&xs_state.suspend_mutex);
63148 +       mutex_lock(&xs_state.request_mutex);
63149 +}
63150 +
63151 +void xs_resume(void)
63152 +{
63153 +       struct xenbus_watch *watch;
63154 +       char token[sizeof(watch) * 2 + 1];
63155 +
63156 +       mutex_unlock(&xs_state.request_mutex);
63157 +
63158 +       /* No need for watches_lock: the suspend_mutex is sufficient. */
63159 +       list_for_each_entry(watch, &watches, list) {
63160 +               sprintf(token, "%lX", (long)watch);
63161 +               xs_watch(watch->node, token);
63162 +       }
63163 +
63164 +       up_write(&xs_state.suspend_mutex);
63165 +}
63166 +
63167 +static int xenwatch_handle_callback(void *data)
63168 +{
63169 +       struct xs_stored_msg *msg = data;
63170 +
63171 +       msg->u.watch.handle->callback(msg->u.watch.handle,
63172 +                                     (const char **)msg->u.watch.vec,
63173 +                                     msg->u.watch.vec_size);
63174 +
63175 +       kfree(msg->u.watch.vec);
63176 +       kfree(msg);
63177 +
63178 +       /* Kill this kthread if we were spawned just for this callback. */
63179 +       if (current->pid != xenwatch_pid)
63180 +               do_exit(0);
63181 +
63182 +       return 0;
63183 +}
63184 +
63185 +static int xenwatch_thread(void *unused)
63186 +{
63187 +       struct list_head *ent;
63188 +       struct xs_stored_msg *msg;
63189 +
63190 +       for (;;) {
63191 +               wait_event_interruptible(watch_events_waitq,
63192 +                                        !list_empty(&watch_events));
63193 +
63194 +               if (kthread_should_stop())
63195 +                       break;
63196 +
63197 +               mutex_lock(&xenwatch_mutex);
63198 +
63199 +               spin_lock(&watch_events_lock);
63200 +               ent = watch_events.next;
63201 +               if (ent != &watch_events)
63202 +                       list_del(ent);
63203 +               spin_unlock(&watch_events_lock);
63204 +
63205 +               if (ent != &watch_events) {
63206 +                       msg = list_entry(ent, struct xs_stored_msg, list);
63207 +                       if (msg->u.watch.handle->flags & XBWF_new_thread)
63208 +                               kthread_run(xenwatch_handle_callback,
63209 +                                           msg, "xenwatch_cb");
63210 +                       else
63211 +                               xenwatch_handle_callback(msg);
63212 +               }
63213 +
63214 +               mutex_unlock(&xenwatch_mutex);
63215 +       }
63216 +
63217 +       return 0;
63218 +}
63219 +
63220 +static int process_msg(void)
63221 +{
63222 +       struct xs_stored_msg *msg;
63223 +       char *body;
63224 +       int err;
63225 +
63226 +       msg = kmalloc(sizeof(*msg), GFP_KERNEL);
63227 +       if (msg == NULL)
63228 +               return -ENOMEM;
63229 +
63230 +       err = xb_read(&msg->hdr, sizeof(msg->hdr));
63231 +       if (err) {
63232 +               kfree(msg);
63233 +               return err;
63234 +       }
63235 +
63236 +       body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
63237 +       if (body == NULL) {
63238 +               kfree(msg);
63239 +               return -ENOMEM;
63240 +       }
63241 +
63242 +       err = xb_read(body, msg->hdr.len);
63243 +       if (err) {
63244 +               kfree(body);
63245 +               kfree(msg);
63246 +               return err;
63247 +       }
63248 +       body[msg->hdr.len] = '\0';
63249 +
63250 +       if (msg->hdr.type == XS_WATCH_EVENT) {
63251 +               msg->u.watch.vec = split(body, msg->hdr.len,
63252 +                                        &msg->u.watch.vec_size);
63253 +               if (IS_ERR(msg->u.watch.vec)) {
63254 +                       kfree(msg);
63255 +                       return PTR_ERR(msg->u.watch.vec);
63256 +               }
63257 +
63258 +               spin_lock(&watches_lock);
63259 +               msg->u.watch.handle = find_watch(
63260 +                       msg->u.watch.vec[XS_WATCH_TOKEN]);
63261 +               if (msg->u.watch.handle != NULL) {
63262 +                       spin_lock(&watch_events_lock);
63263 +                       list_add_tail(&msg->list, &watch_events);
63264 +                       wake_up(&watch_events_waitq);
63265 +                       spin_unlock(&watch_events_lock);
63266 +               } else {
63267 +                       kfree(msg->u.watch.vec);
63268 +                       kfree(msg);
63269 +               }
63270 +               spin_unlock(&watches_lock);
63271 +       } else {
63272 +               msg->u.reply.body = body;
63273 +               spin_lock(&xs_state.reply_lock);
63274 +               list_add_tail(&msg->list, &xs_state.reply_list);
63275 +               spin_unlock(&xs_state.reply_lock);
63276 +               wake_up(&xs_state.reply_waitq);
63277 +       }
63278 +
63279 +       return 0;
63280 +}
63281 +
63282 +static int xenbus_thread(void *unused)
63283 +{
63284 +       int err;
63285 +
63286 +       for (;;) {
63287 +               err = process_msg();
63288 +               if (err)
63289 +                       printk(KERN_WARNING "XENBUS error %d while reading "
63290 +                              "message\n", err);
63291 +               if (kthread_should_stop())
63292 +                       break;
63293 +       }
63294 +
63295 +       return 0;
63296 +}
63297 +
63298 +int xs_init(void)
63299 +{
63300 +       int err;
63301 +       struct task_struct *task;
63302 +
63303 +       INIT_LIST_HEAD(&xs_state.reply_list);
63304 +       spin_lock_init(&xs_state.reply_lock);
63305 +       init_waitqueue_head(&xs_state.reply_waitq);
63306 +
63307 +       mutex_init(&xs_state.request_mutex);
63308 +       init_rwsem(&xs_state.suspend_mutex);
63309 +
63310 +       /* Initialize the shared memory rings to talk to xenstored */
63311 +       err = xb_init_comms();
63312 +       if (err)
63313 +               return err;
63314 +
63315 +       task = kthread_run(xenwatch_thread, NULL, "xenwatch");
63316 +       if (IS_ERR(task))
63317 +               return PTR_ERR(task);
63318 +       xenwatch_pid = task->pid;
63319 +
63320 +       task = kthread_run(xenbus_thread, NULL, "xenbus");
63321 +       if (IS_ERR(task))
63322 +               return PTR_ERR(task);
63323 +
63324 +       return 0;
63325 +}
63326 +
63327 +/*
63328 + * Local variables:
63329 + *  c-file-style: "linux"
63330 + *  indent-tabs-mode: t
63331 + *  c-indent-level: 8
63332 + *  c-basic-offset: 8
63333 + *  tab-width: 8
63334 + * End:
63335 + */
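The file above is the complete in-kernel xenstore client: xs_talkv() frames a request and collects the reply, the watch machinery hands XS_WATCH_EVENT messages to the xenwatch thread, and the transaction calls bracket groups of accesses against save/restore. A minimal usage sketch, not part of the patch (the function name and the store path "device/vif/0" are hypothetical; error handling is trimmed):

	/* Hypothetical example of the xenbus_* API defined above. */
	static int example_read_backend_id(void)
	{
		xenbus_transaction_t t;
		int backend_id, err;

	again:
		err = xenbus_transaction_start(&t);
		if (err)
			return err;

		err = xenbus_scanf(t, "device/vif/0", "backend-id",
				   "%i", &backend_id);
		if (err < 0) {
			xenbus_transaction_end(t, 1);	/* abort */
			return err;
		}

		err = xenbus_transaction_end(t, 0);	/* commit */
		if (err == -EAGAIN)
			goto again;	/* store changed under us: retry */
		return err ? err : backend_id;
	}

The retry-on-EAGAIN loop is the intended pattern: XS_TRANSACTION_END reports EAGAIN when the store changed while the transaction was open, so callers simply restart.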
63336 diff -Nurp ref-linux-2.6.16.9/fs/Kconfig tmp-linux-2.6-xen.patch/fs/Kconfig
63337 --- ref-linux-2.6.16.9/fs/Kconfig       2006-04-19 08:10:14.000000000 +0200
63338 +++ tmp-linux-2.6-xen.patch/fs/Kconfig  2006-04-10 00:05:52.000000000 +0200
63339 @@ -841,6 +841,7 @@ config TMPFS
63340  config HUGETLBFS
63341         bool "HugeTLB file system support"
63342         depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
63343 +       depends !XEN
63344  
63345  config HUGETLB_PAGE
63346         def_bool HUGETLBFS
63347 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/a.out.h tmp-linux-2.6-xen.patch/include/asm-i386/a.out.h
63348 --- ref-linux-2.6.16.9/include/asm-i386/a.out.h 2006-04-19 08:10:14.000000000 +0200
63349 +++ tmp-linux-2.6-xen.patch/include/asm-i386/a.out.h    2006-04-10 00:05:52.000000000 +0200
63350 @@ -19,7 +19,7 @@ struct exec
63351  
63352  #ifdef __KERNEL__
63353  
63354 -#define STACK_TOP      TASK_SIZE
63355 +#define STACK_TOP      (TASK_SIZE - 3*PAGE_SIZE)
63356  
63357  #endif
63358  
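With the stock i386 layout this reserves three pages at the very top of user space (worked example, assuming the default TASK_SIZE of 0xc0000000 and 4 KiB pages):

	STACK_TOP = TASK_SIZE - 3*PAGE_SIZE
	          = 0xc0000000 - 0x3000
	          = 0xbfffd000

The freed pages hold the per-process vsyscall mapping installed by arch_setup_additional_pages(), declared in the elf.h hunk further down.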
63359 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/apic.h tmp-linux-2.6-xen.patch/include/asm-i386/apic.h
63360 --- ref-linux-2.6.16.9/include/asm-i386/apic.h  2006-04-19 08:10:14.000000000 +0200
63361 +++ tmp-linux-2.6-xen.patch/include/asm-i386/apic.h     2006-04-10 00:05:52.000000000 +0200
63362 @@ -132,10 +132,12 @@ extern unsigned int nmi_watchdog;
63363  
63364  extern int disable_timer_pin_1;
63365  
63366 +#ifndef CONFIG_XEN
63367  void smp_send_timer_broadcast_ipi(struct pt_regs *regs);
63368  void switch_APIC_timer_to_ipi(void *cpumask);
63369  void switch_ipi_to_APIC_timer(void *cpumask);
63370  #define ARCH_APICTIMER_STOPS_ON_C3     1
63371 +#endif
63372  
63373  extern int timer_over_8254;
63374  
63375 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/atomic.h tmp-linux-2.6-xen.patch/include/asm-i386/atomic.h
63376 --- ref-linux-2.6.16.9/include/asm-i386/atomic.h        2006-04-19 08:10:14.000000000 +0200
63377 +++ tmp-linux-2.6-xen.patch/include/asm-i386/atomic.h   2006-04-10 00:05:52.000000000 +0200
63378 @@ -4,18 +4,13 @@
63379  #include <linux/config.h>
63380  #include <linux/compiler.h>
63381  #include <asm/processor.h>
63382 +#include <asm/smp_alt.h>
63383  
63384  /*
63385   * Atomic operations that C can't guarantee us.  Useful for
63386   * resource counting etc..
63387   */
63388  
63389 -#ifdef CONFIG_SMP
63390 -#define LOCK "lock ; "
63391 -#else
63392 -#define LOCK ""
63393 -#endif
63394 -
63395  /*
63396   * Make sure gcc doesn't try to be clever and move things around
63397   * on us. We need to use _exactly_ the address the user gave us,
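The compile-time LOCK definition deleted here is replaced by one from <asm/smp_alt.h> (a header this patch adds elsewhere): under CONFIG_SMP_ALTERNATIVES the macro emits the lock prefix through a recorded, patchable site so it can be rewritten to a NOP while only one CPU is online. Call sites do not change; as a sketch, atomic_add() in 2.6.16 keeps its shape:

	static __inline__ void atomic_add(int i, atomic_t *v)
	{
		__asm__ __volatile__(
			LOCK "addl %1,%0"	/* prefix now patchable at boot */
			:"=m" (v->counter)
			:"ir" (i), "m" (v->counter));
	}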
63398 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/bitops.h tmp-linux-2.6-xen.patch/include/asm-i386/bitops.h
63399 --- ref-linux-2.6.16.9/include/asm-i386/bitops.h        2006-04-19 08:10:14.000000000 +0200
63400 +++ tmp-linux-2.6-xen.patch/include/asm-i386/bitops.h   2006-04-10 00:05:52.000000000 +0200
63401 @@ -7,6 +7,7 @@
63402  
63403  #include <linux/config.h>
63404  #include <linux/compiler.h>
63405 +#include <asm/smp_alt.h>
63406  
63407  /*
63408   * These have to be done with inline assembly: that way the bit-setting
63409 @@ -16,12 +17,6 @@
63410   * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
63411   */
63412  
63413 -#ifdef CONFIG_SMP
63414 -#define LOCK_PREFIX "lock ; "
63415 -#else
63416 -#define LOCK_PREFIX ""
63417 -#endif
63418 -
63419  #define ADDR (*(volatile long *) addr)
63420  
63421  /**
63422 @@ -41,7 +36,7 @@
63423   */
63424  static inline void set_bit(int nr, volatile unsigned long * addr)
63425  {
63426 -       __asm__ __volatile__( LOCK_PREFIX
63427 +       __asm__ __volatile__( LOCK
63428                 "btsl %1,%0"
63429                 :"+m" (ADDR)
63430                 :"Ir" (nr));
63431 @@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
63432   */
63433  static inline void clear_bit(int nr, volatile unsigned long * addr)
63434  {
63435 -       __asm__ __volatile__( LOCK_PREFIX
63436 +       __asm__ __volatile__( LOCK
63437                 "btrl %1,%0"
63438                 :"+m" (ADDR)
63439                 :"Ir" (nr));
63440 @@ -121,7 +116,7 @@ static inline void __change_bit(int nr, 
63441   */
63442  static inline void change_bit(int nr, volatile unsigned long * addr)
63443  {
63444 -       __asm__ __volatile__( LOCK_PREFIX
63445 +       __asm__ __volatile__( LOCK
63446                 "btcl %1,%0"
63447                 :"+m" (ADDR)
63448                 :"Ir" (nr));
63449 @@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
63450  {
63451         int oldbit;
63452  
63453 -       __asm__ __volatile__( LOCK_PREFIX
63454 +       __asm__ __volatile__( LOCK
63455                 "btsl %2,%1\n\tsbbl %0,%0"
63456                 :"=r" (oldbit),"+m" (ADDR)
63457                 :"Ir" (nr) : "memory");
63458 @@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
63459  {
63460         int oldbit;
63461  
63462 -       __asm__ __volatile__( LOCK_PREFIX
63463 +       __asm__ __volatile__( LOCK
63464                 "btrl %2,%1\n\tsbbl %0,%0"
63465                 :"=r" (oldbit),"+m" (ADDR)
63466                 :"Ir" (nr) : "memory");
63467 @@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
63468  {
63469         int oldbit;
63470  
63471 -       __asm__ __volatile__( LOCK_PREFIX
63472 +       __asm__ __volatile__( LOCK
63473                 "btcl %2,%1\n\tsbbl %0,%0"
63474                 :"=r" (oldbit),"+m" (ADDR)
63475                 :"Ir" (nr) : "memory");
63476 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/elf.h tmp-linux-2.6-xen.patch/include/asm-i386/elf.h
63477 --- ref-linux-2.6.16.9/include/asm-i386/elf.h   2006-04-19 08:10:14.000000000 +0200
63478 +++ tmp-linux-2.6-xen.patch/include/asm-i386/elf.h      2006-04-10 00:05:52.000000000 +0200
63479 @@ -129,11 +129,16 @@ extern int dump_task_extended_fpu (struc
63480  #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
63481  #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
63482  
63483 -#define VSYSCALL_BASE  (__fix_to_virt(FIX_VSYSCALL))
63484 +#define VSYSCALL_BASE  (PAGE_OFFSET - 2*PAGE_SIZE)
63485  #define VSYSCALL_EHDR  ((const struct elfhdr *) VSYSCALL_BASE)
63486  #define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
63487  extern void __kernel_vsyscall;
63488  
63489 +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
63490 +struct linux_binprm;
63491 +extern int arch_setup_additional_pages(struct linux_binprm *bprm,
63492 +                                       int executable_stack);
63493 +
63494  #define ARCH_DLINFO                                            \
63495  do {                                                           \
63496                 NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY);        \
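Note that ARCH_DLINFO keeps advertising the vsyscall entry point through the ELF auxiliary vector; only the address behind it moves. For illustration (userspace sketch, 32-bit process assumed; not part of the patch), the consumer side looks like:

	#include <elf.h>
	#include <stdio.h>

	int main(int argc, char **argv, char **envp)
	{
		Elf32_auxv_t *auxv;

		/* The auxiliary vector sits just past the environment block. */
		while (*envp)
			envp++;
		for (auxv = (Elf32_auxv_t *)(envp + 1); auxv->a_type != AT_NULL; auxv++)
			if (auxv->a_type == AT_SYSINFO)
				printf("__kernel_vsyscall at %#lx\n",
				       (unsigned long)auxv->a_un.a_val);
		return 0;
	}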
63497 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/fixmap.h tmp-linux-2.6-xen.patch/include/asm-i386/fixmap.h
63498 --- ref-linux-2.6.16.9/include/asm-i386/fixmap.h        2006-04-19 08:10:14.000000000 +0200
63499 +++ tmp-linux-2.6-xen.patch/include/asm-i386/fixmap.h   2006-04-10 00:05:52.000000000 +0200
63500 @@ -20,7 +20,7 @@
63501   * Leave one empty page between vmalloc'ed areas and
63502   * the start of the fixmap.
63503   */
63504 -#define __FIXADDR_TOP  0xfffff000
63505 +extern unsigned long __FIXADDR_TOP;
63506  
63507  #ifndef __ASSEMBLY__
63508  #include <linux/kernel.h>
63509 @@ -52,7 +52,6 @@
63510   */
63511  enum fixed_addresses {
63512         FIX_HOLE,
63513 -       FIX_VSYSCALL,
63514  #ifdef CONFIG_X86_LOCAL_APIC
63515         FIX_APIC_BASE,  /* local (CPU) APIC) -- required for SMP or not */
63516  #endif
63517 @@ -95,6 +94,8 @@ enum fixed_addresses {
63518  extern void __set_fixmap (enum fixed_addresses idx,
63519                                         unsigned long phys, pgprot_t flags);
63520  
63521 +extern void set_fixaddr_top(unsigned long top);
63522 +
63523  #define set_fixmap(idx, phys) \
63524                 __set_fixmap(idx, phys, PAGE_KERNEL)
63525  /*
63526 @@ -116,14 +117,6 @@ extern void __set_fixmap (enum fixed_add
63527  #define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
63528  #define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
63529  
63530 -/*
63531 - * This is the range that is readable by user mode, and things
63532 - * acting like user mode such as get_user_pages.
63533 - */
63534 -#define FIXADDR_USER_START     (__fix_to_virt(FIX_VSYSCALL))
63535 -#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
63536 -
63537 -
63538  extern void __this_fixmap_does_not_exist(void);
63539  
63540  /*
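Making __FIXADDR_TOP a variable means every fixmap slot moves together when set_fixaddr_top() is called; the Xen boot code uses this to keep the fixmap clear of the hypervisor hole. The per-index arithmetic is unchanged (sketch with illustrative values):

	/* With __FIXADDR_TOP at its default 0xfffff000 and 4 KiB pages:
	 *	__fix_to_virt(0) == 0xfffff000
	 *	__fix_to_virt(1) == 0xffffe000
	 * After set_fixaddr_top() lowers the top, the same indices
	 * yield addresses counted down from the new value instead.
	 */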
63541 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/futex.h tmp-linux-2.6-xen.patch/include/asm-i386/futex.h
63542 --- ref-linux-2.6.16.9/include/asm-i386/futex.h 2006-04-19 08:10:14.000000000 +0200
63543 +++ tmp-linux-2.6-xen.patch/include/asm-i386/futex.h    2006-04-10 00:05:52.000000000 +0200
63544 @@ -28,7 +28,7 @@
63545  "1:    movl    %2, %0\n\
63546         movl    %0, %3\n"                                       \
63547         insn "\n"                                               \
63548 -"2:    " LOCK_PREFIX "cmpxchgl %3, %2\n\
63549 +"2:    " LOCK "cmpxchgl %3, %2\n\
63550         jnz     1b\n\
63551  3:     .section .fixup,\"ax\"\n\
63552  4:     mov     %5, %1\n\
63553 @@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, 
63554  #endif
63555                 switch (op) {
63556                 case FUTEX_OP_ADD:
63557 -                       __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
63558 +                       __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
63559                                            oldval, uaddr, oparg);
63560                         break;
63561                 case FUTEX_OP_OR:
63562 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-default/mach_traps.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-default/mach_traps.h
63563 --- ref-linux-2.6.16.9/include/asm-i386/mach-default/mach_traps.h       2006-04-19 08:10:14.000000000 +0200
63564 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-default/mach_traps.h  2006-04-10 00:05:52.000000000 +0200
63565 @@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
63566         outb(reason, 0x61);
63567  }
63568  
63569 +static inline void clear_io_check_error(unsigned char reason)
63570 +{
63571 +       unsigned long i;
63572 +
63573 +       reason = (reason & 0xf) | 8;
63574 +       outb(reason, 0x61);
63575 +       i = 2000;
63576 +       while (--i) udelay(1000);
63577 +       reason &= ~8;
63578 +       outb(reason, 0x61);
63579 +}
63580 +
63581  static inline unsigned char get_nmi_reason(void)
63582  {
63583         return inb(0x61);
63584 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/agp.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/agp.h
63585 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/agp.h      1970-01-01 01:00:00.000000000 +0100
63586 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/agp.h 2006-04-10 00:05:52.000000000 +0200
63587 @@ -0,0 +1,37 @@
63588 +#ifndef AGP_H
63589 +#define AGP_H 1
63590 +
63591 +#include <asm/pgtable.h>
63592 +#include <asm/cacheflush.h>
63593 +#include <asm/system.h>
63594 +
63595 +/* 
63596 + * Functions to keep the agpgart mappings coherent with the MMU.
63597 + * The GART gives the CPU a physical alias of pages in memory. The alias region is
63598 + * mapped uncacheable. Make sure there are no conflicting mappings
63599 + * with different cacheability attributes for the same page. This avoids
63600 + * data corruption on some CPUs.
63601 + */
63602 +
63603 +int map_page_into_agp(struct page *page);
63604 +int unmap_page_from_agp(struct page *page);
63605 +#define flush_agp_mappings() global_flush_tlb()
63606 +
63607 +/* Could use CLFLUSH here if the cpu supports it. But then it would
63608 +   need to be called for each cacheline of the whole page so it may not be 
63609 +   worth it. Would need a page for it. */
63610 +#define flush_agp_cache() wbinvd()
63611 +
63612 +/* Convert a physical address to an address suitable for the GART. */
63613 +#define phys_to_gart(x) phys_to_machine(x)
63614 +#define gart_to_phys(x) machine_to_phys(x)
63615 +
63616 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
63617 +#define alloc_gatt_pages(order)        ({                                          \
63618 +       char *_t; dma_addr_t _d;                                            \
63619 +       _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL);    \
63620 +       _t; })
63621 +#define free_gatt_pages(table, order)  \
63622 +       dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
63623 +
63624 +#endif
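The phys_to_gart()/gart_to_phys() lines carry the real content of this header: the GART, like any bus master, needs machine addresses, while a Xen guest's "physical" addresses are pseudo-physical. phys_to_machine() is supplied by the mach-xen page headers added elsewhere in this patch; in outline it does (paraphrased sketch, not quoted from the patch):

	static inline maddr_t phys_to_machine(paddr_t phys)
	{
		/* Swap the pseudo-physical frame number for the machine
		 * frame number, preserving the offset within the page. */
		maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
		return (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
	}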
63625 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/desc.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/desc.h
63626 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/desc.h     1970-01-01 01:00:00.000000000 +0100
63627 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/desc.h        2006-04-10 00:05:52.000000000 +0200
63628 @@ -0,0 +1,164 @@
63629 +#ifndef __ARCH_DESC_H
63630 +#define __ARCH_DESC_H
63631 +
63632 +#include <asm/ldt.h>
63633 +#include <asm/segment.h>
63634 +
63635 +#define CPU_16BIT_STACK_SIZE 1024
63636 +
63637 +#ifndef __ASSEMBLY__
63638 +
63639 +#include <linux/preempt.h>
63640 +#include <linux/smp.h>
63641 +
63642 +#include <asm/mmu.h>
63643 +
63644 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
63645 +
63646 +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
63647 +
63648 +struct Xgt_desc_struct {
63649 +       unsigned short size;
63650 +       unsigned long address __attribute__((packed));
63651 +       unsigned short pad;
63652 +} __attribute__ ((packed));
63653 +
63654 +extern struct Xgt_desc_struct idt_descr;
63655 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
63656 +
63657 +
63658 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
63659 +{
63660 +       return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
63661 +}
63662 +
63663 +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
63664 +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
63665 +
63666 +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
63667 +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
63668 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
63669 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
63670 +
63671 +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
63672 +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
63673 +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
63674 +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
63675 +
63676 +/*
63677 + * This is the ldt that every process will get unless we need
63678 + * something other than this.
63679 + */
63680 +extern struct desc_struct default_ldt[];
63681 +extern void set_intr_gate(unsigned int irq, void * addr);
63682 +
63683 +#define _set_tssldt_desc(n,addr,limit,type) \
63684 +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
63685 +       "movw %w1,2(%2)\n\t" \
63686 +       "rorl $16,%1\n\t" \
63687 +       "movb %b1,4(%2)\n\t" \
63688 +       "movb %4,5(%2)\n\t" \
63689 +       "movb $0,6(%2)\n\t" \
63690 +       "movb %h1,7(%2)\n\t" \
63691 +       "rorl $16,%1" \
63692 +       : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
63693 +
63694 +#ifndef CONFIG_X86_NO_TSS
63695 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
63696 +{
63697 +       _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
63698 +               offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
63699 +}
63700 +
63701 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
63702 +#endif
63703 +
63704 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
63705 +{
63706 +       _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
63707 +}
63708 +
63709 +#define LDT_entry_a(info) \
63710 +       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
63711 +
63712 +#define LDT_entry_b(info) \
63713 +       (((info)->base_addr & 0xff000000) | \
63714 +       (((info)->base_addr & 0x00ff0000) >> 16) | \
63715 +       ((info)->limit & 0xf0000) | \
63716 +       (((info)->read_exec_only ^ 1) << 9) | \
63717 +       ((info)->contents << 10) | \
63718 +       (((info)->seg_not_present ^ 1) << 15) | \
63719 +       ((info)->seg_32bit << 22) | \
63720 +       ((info)->limit_in_pages << 23) | \
63721 +       ((info)->useable << 20) | \
63722 +       0x7000)
63723 +
63724 +#define LDT_empty(info) (\
63725 +       (info)->base_addr       == 0    && \
63726 +       (info)->limit           == 0    && \
63727 +       (info)->contents        == 0    && \
63728 +       (info)->read_exec_only  == 1    && \
63729 +       (info)->seg_32bit       == 0    && \
63730 +       (info)->limit_in_pages  == 0    && \
63731 +       (info)->seg_not_present == 1    && \
63732 +       (info)->useable         == 0    )
63733 +
63734 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
63735 +
63736 +#if TLS_SIZE != 24
63737 +# error update this code.
63738 +#endif
63739 +
63740 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
63741 +{
63742 +#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
63743 +       C(0); C(1); C(2);
63744 +#undef C
63745 +}
63746 +
63747 +static inline void clear_LDT(void)
63748 +{
63749 +       int cpu = get_cpu();
63750 +
63751 +       /*
63752 +        * NB. We load the default_ldt for lcall7/27 handling on demand, as
63753 +        * it slows down context switching. No one uses it anyway.
63754 +        */
63755 +       cpu = cpu;              /* XXX avoid compiler warning */
63756 +       xen_set_ldt(0UL, 0);
63757 +       put_cpu();
63758 +}
63759 +
63760 +/*
63761 + * load one particular LDT into the current CPU
63762 + */
63763 +static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
63764 +{
63765 +       void *segments = pc->ldt;
63766 +       int count = pc->size;
63767 +
63768 +       if (likely(!count))
63769 +               segments = NULL;
63770 +
63771 +       xen_set_ldt((unsigned long)segments, count);
63772 +}
63773 +
63774 +static inline void load_LDT(mm_context_t *pc)
63775 +{
63776 +       int cpu = get_cpu();
63777 +       load_LDT_nolock(pc, cpu);
63778 +       put_cpu();
63779 +}
63780 +
63781 +static inline unsigned long get_desc_base(unsigned long *desc)
63782 +{
63783 +       unsigned long base;
63784 +       base = ((desc[0] >> 16)  & 0x0000ffff) |
63785 +               ((desc[1] << 16) & 0x00ff0000) |
63786 +               (desc[1] & 0xff000000);
63787 +       return base;
63788 +}
63789 +
63790 +#endif /* !__ASSEMBLY__ */
63791 +
63792 +#endif
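A note on load_TLS() and write_ldt_entry() above: a paravirtualized guest may not write descriptor tables directly, since Xen maps the GDT read-only and validates every entry, so each slot update becomes a hypercall on the descriptor's machine address. Expanding the C(i) macro for one TLS slot gives, in sketch form:

	HYPERVISOR_update_descriptor(
		virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 0]),
		*(u64 *)&t->tls_array[0]);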
63793 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/dma-mapping.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/dma-mapping.h
63794 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/dma-mapping.h      1970-01-01 01:00:00.000000000 +0100
63795 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/dma-mapping.h 2006-04-10 00:05:52.000000000 +0200
63796 @@ -0,0 +1,156 @@
63797 +#ifndef _ASM_I386_DMA_MAPPING_H
63798 +#define _ASM_I386_DMA_MAPPING_H
63799 +
63800 +/*
63801 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
63802 + * documentation.
63803 + */
63804 +
63805 +#include <linux/config.h>
63806 +#include <linux/mm.h>
63807 +#include <asm/cache.h>
63808 +#include <asm/io.h>
63809 +#include <asm/scatterlist.h>
63810 +#include <asm/swiotlb.h>
63811 +
63812 +static inline int
63813 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
63814 +{
63815 +       dma_addr_t mask = 0xffffffff;
63816 +       /* If the device has a mask, use it, otherwise default to 32 bits */
63817 +       if (hwdev && hwdev->dma_mask)
63818 +               mask = *hwdev->dma_mask;
63819 +       return (addr & ~mask) != 0;
63820 +}
63821 +
63822 +static inline int
63823 +range_straddles_page_boundary(void *p, size_t size)
63824 +{
63825 +       extern unsigned long *contiguous_bitmap;
63826 +       return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
63827 +               !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
63828 +}
63829 +
63830 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
63831 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
63832 +
63833 +void *dma_alloc_coherent(struct device *dev, size_t size,
63834 +                          dma_addr_t *dma_handle, gfp_t flag);
63835 +
63836 +void dma_free_coherent(struct device *dev, size_t size,
63837 +                        void *vaddr, dma_addr_t dma_handle);
63838 +
63839 +extern dma_addr_t
63840 +dma_map_single(struct device *dev, void *ptr, size_t size,
63841 +              enum dma_data_direction direction);
63842 +
63843 +extern void
63844 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
63845 +                enum dma_data_direction direction);
63846 +
63847 +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
63848 +                     int nents, enum dma_data_direction direction);
63849 +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
63850 +                        int nents, enum dma_data_direction direction);
63851 +
63852 +extern dma_addr_t
63853 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
63854 +            size_t size, enum dma_data_direction direction);
63855 +
63856 +extern void
63857 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
63858 +              enum dma_data_direction direction);
63859 +
63860 +extern void
63861 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
63862 +                       enum dma_data_direction direction);
63863 +
63864 +extern void
63865 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
63866 +                           enum dma_data_direction direction);
63867 +
63868 +static inline void
63869 +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
63870 +                             unsigned long offset, size_t size,
63871 +                             enum dma_data_direction direction)
63872 +{
63873 +       dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
63874 +}
63875 +
63876 +static inline void
63877 +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
63878 +                                unsigned long offset, size_t size,
63879 +                                enum dma_data_direction direction)
63880 +{
63881 +       dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
63882 +}
63883 +
63884 +static inline void
63885 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
63886 +                   enum dma_data_direction direction)
63887 +{
63888 +       if (swiotlb)
63889 +               swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
63890 +       flush_write_buffers();
63891 +}
63892 +
63893 +static inline void
63894 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
63895 +                   enum dma_data_direction direction)
63896 +{
63897 +       if (swiotlb)
63898 +               swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
63899 +       flush_write_buffers();
63900 +}
63901 +
63902 +extern int
63903 +dma_mapping_error(dma_addr_t dma_addr);
63904 +
63905 +extern int
63906 +dma_supported(struct device *dev, u64 mask);
63907 +
63908 +static inline int
63909 +dma_set_mask(struct device *dev, u64 mask)
63910 +{
63911 +       if(!dev->dma_mask || !dma_supported(dev, mask))
63912 +               return -EIO;
63913 +
63914 +       *dev->dma_mask = mask;
63915 +
63916 +       return 0;
63917 +}
63918 +
63919 +#ifdef __i386__
63920 +static inline int
63921 +dma_get_cache_alignment(void)
63922 +{
63923 +       /* no easy way to get cache size on all x86, so return the
63924 +        * maximum possible, to be safe */
63925 +       return (1 << INTERNODE_CACHE_SHIFT);
63926 +}
63927 +#else
63928 +extern int dma_get_cache_alignment(void);
63929 +#endif
63930 +
63931 +#define dma_is_consistent(d)   (1)
63932 +
63933 +static inline void
63934 +dma_cache_sync(void *vaddr, size_t size,
63935 +              enum dma_data_direction direction)
63936 +{
63937 +       flush_write_buffers();
63938 +}
63939 +
63940 +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
63941 +extern int
63942 +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
63943 +                           dma_addr_t device_addr, size_t size, int flags);
63944 +
63945 +extern void
63946 +dma_release_declared_memory(struct device *dev);
63947 +
63948 +extern void *
63949 +dma_mark_declared_memory_occupied(struct device *dev,
63950 +                                 dma_addr_t device_addr, size_t size);
63951 +
63952 +#endif
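range_straddles_page_boundary() above is the predicate that routes buffers through swiotlb: contiguous pseudo-physical pages need not be contiguous in machine memory, so a buffer crossing a page boundary is only DMA-safe when contiguous_bitmap marks it machine-contiguous. A worked sketch with 4 KiB pages (addresses illustrative):

	/* offset = (unsigned long)p & ~PAGE_MASK
	 *
	 * p ends in 0xf00, size = 0x100: offset + size == PAGE_SIZE,
	 *	the buffer fits in one page -> returns 0.
	 * p ends in 0xf00, size = 0x200: offset + size > PAGE_SIZE,
	 *	the buffer crosses a boundary -> returns 0 only if the
	 *	contiguous_bitmap bit for the page is set, else 1
	 *	(the caller must bounce through swiotlb).
	 */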
63953 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/fixmap.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/fixmap.h
63954 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/fixmap.h   1970-01-01 01:00:00.000000000 +0100
63955 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/fixmap.h      2006-04-10 00:05:52.000000000 +0200
63956 @@ -0,0 +1,156 @@
63957 +/*
63958 + * fixmap.h: compile-time virtual memory allocation
63959 + *
63960 + * This file is subject to the terms and conditions of the GNU General Public
63961 + * License.  See the file "COPYING" in the main directory of this archive
63962 + * for more details.
63963 + *
63964 + * Copyright (C) 1998 Ingo Molnar
63965 + *
63966 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
63967 + */
63968 +
63969 +#ifndef _ASM_FIXMAP_H
63970 +#define _ASM_FIXMAP_H
63971 +
63972 +#include <linux/config.h>
63973 +
63974 +/* used by vmalloc.c, vsyscall.lds.S.
63975 + *
63976 + * Leave one empty page between vmalloc'ed areas and
63977 + * the start of the fixmap.
63978 + */
63979 +extern unsigned long __FIXADDR_TOP;
63980 +
63981 +#ifndef __ASSEMBLY__
63982 +#include <linux/kernel.h>
63983 +#include <asm/acpi.h>
63984 +#include <asm/apicdef.h>
63985 +#include <asm/page.h>
63986 +#include <xen/gnttab.h>
63987 +#ifdef CONFIG_HIGHMEM
63988 +#include <linux/threads.h>
63989 +#include <asm/kmap_types.h>
63990 +#endif
63991 +
63992 +/*
63993 + * Here we define all the compile-time 'special' virtual
63994 + * addresses. The point is to have a constant address at
63995 + * compile time, but to set the physical address only
63996 + * in the boot process. We allocate these special addresses
63997 + * from the end of virtual memory (0xfffff000) backwards.
63998 + * Also this lets us do fail-safe vmalloc(), we
63999 + * can guarantee that these special addresses and
64000 + * vmalloc()-ed addresses never overlap.
64001 + *
64002 + * these 'compile-time allocated' memory buffers are
64003 + * fixed-size 4k pages (or larger if used with an increment
64004 + * higher than 1). Use set_fixmap(idx,phys) to associate
64005 + * physical memory with fixmap indices.
64006 + *
64007 + * TLB entries of such buffers will not be flushed across
64008 + * task switches.
64009 + */
64010 +enum fixed_addresses {
64011 +       FIX_HOLE,
64012 +#ifdef CONFIG_X86_LOCAL_APIC
64013 +       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
64014 +#endif
64015 +#ifdef CONFIG_X86_IO_APIC
64016 +       FIX_IO_APIC_BASE_0,
64017 +       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
64018 +#endif
64019 +#ifdef CONFIG_X86_VISWS_APIC
64020 +       FIX_CO_CPU,     /* Cobalt timer */
64021 +       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */ 
64022 +       FIX_LI_PCIA,    /* Lithium PCI Bridge A */
64023 +       FIX_LI_PCIB,    /* Lithium PCI Bridge B */
64024 +#endif
64025 +#ifdef CONFIG_X86_F00F_BUG
64026 +       FIX_F00F_IDT,   /* Virtual mapping for IDT */
64027 +#endif
64028 +#ifdef CONFIG_X86_CYCLONE_TIMER
64029 +       FIX_CYCLONE_TIMER, /*cyclone timer register*/
64030 +#endif 
64031 +#ifdef CONFIG_HIGHMEM
64032 +       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
64033 +       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
64034 +#endif
64035 +#ifdef CONFIG_ACPI
64036 +       FIX_ACPI_BEGIN,
64037 +       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
64038 +#endif
64039 +#ifdef CONFIG_PCI_MMCONFIG
64040 +       FIX_PCIE_MCFG,
64041 +#endif
64042 +       FIX_SHARED_INFO,
64043 +#define NR_FIX_ISAMAPS 256
64044 +       FIX_ISAMAP_END,
64045 +       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
64046 +       __end_of_permanent_fixed_addresses,
64047 +       /* temporary boot-time mappings, used before ioremap() is functional */
64048 +#define NR_FIX_BTMAPS  16
64049 +       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
64050 +       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
64051 +       FIX_WP_TEST,
64052 +       __end_of_fixed_addresses
64053 +};
64054 +
64055 +extern void __set_fixmap(enum fixed_addresses idx,
64056 +                                       maddr_t phys, pgprot_t flags);
64057 +
64058 +extern void set_fixaddr_top(unsigned long top);
64059 +
64060 +#define set_fixmap(idx, phys) \
64061 +               __set_fixmap(idx, phys, PAGE_KERNEL)
64062 +/*
64063 + * Some hardware wants to get fixmapped without caching.
64064 + */
64065 +#define set_fixmap_nocache(idx, phys) \
64066 +               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
64067 +
64068 +#define clear_fixmap(idx) \
64069 +               __set_fixmap(idx, 0, __pgprot(0))
64070 +
64071 +#define FIXADDR_TOP    ((unsigned long)__FIXADDR_TOP)
64072 +
64073 +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
64074 +#define __FIXADDR_BOOT_SIZE    (__end_of_fixed_addresses << PAGE_SHIFT)
64075 +#define FIXADDR_START          (FIXADDR_TOP - __FIXADDR_SIZE)
64076 +#define FIXADDR_BOOT_START     (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
64077 +
64078 +#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
64079 +#define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
64080 +
64081 +extern void __this_fixmap_does_not_exist(void);
64082 +
64083 +/*
64084 + * 'index to address' translation. If anyone tries to use the idx
64085 + * directly without translation, we catch the bug with a NULL-dereference
64086 + * kernel oops. Illegal ranges of incoming indices are caught too.
64087 + */
64088 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
64089 +{
64090 +       /*
64091 +        * this branch gets completely eliminated after inlining,
64092 +        * except when someone tries to use fixaddr indices in an
64093 +        * illegal way. (such as mixing up address types or using
64094 +        * out-of-range indices).
64095 +        *
64096 +        * If it doesn't get removed, the linker will complain
64097 +        * loudly with a reasonably clear error message..
64098 +        */
64099 +       if (idx >= __end_of_fixed_addresses)
64100 +               __this_fixmap_does_not_exist();
64101 +
64102 +        return __fix_to_virt(idx);
64103 +}
64104 +
64105 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
64106 +{
64107 +       BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
64108 +       return __virt_to_fix(vaddr);
64109 +}
64110 +
64111 +#endif /* !__ASSEMBLY__ */
64112 +#endif
64113 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/floppy.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/floppy.h
64114 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/floppy.h   1970-01-01 01:00:00.000000000 +0100
64115 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/floppy.h      2006-04-10 00:05:52.000000000 +0200
64116 @@ -0,0 +1,147 @@
64117 +/*
64118 + * Architecture specific parts of the Floppy driver
64119 + *
64120 + * This file is subject to the terms and conditions of the GNU General Public
64121 + * License.  See the file "COPYING" in the main directory of this archive
64122 + * for more details.
64123 + *
64124 + * Copyright (C) 1995
64125 + *
64126 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
64127 + */
64128 +#ifndef __ASM_XEN_I386_FLOPPY_H
64129 +#define __ASM_XEN_I386_FLOPPY_H
64130 +
64131 +#include <linux/vmalloc.h>
64132 +
64133 +/* XEN: Hit DMA paths on the head. This trick is from asm-m68k/floppy.h. */
64134 +#include <asm/dma.h>
64135 +#undef MAX_DMA_ADDRESS
64136 +#define MAX_DMA_ADDRESS 0
64137 +#define CROSS_64KB(a,s) (0)
64138 +
64139 +#define fd_inb(port)                   inb_p(port)
64140 +#define fd_outb(value,port)            outb_p(value,port)
64141 +
64142 +#define fd_request_dma()        (0)
64143 +#define fd_free_dma()           ((void)0)
64144 +#define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
64145 +#define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
64146 +#define fd_free_irq()          free_irq(FLOPPY_IRQ, NULL)
64147 +#define fd_get_dma_residue()    (virtual_dma_count + virtual_dma_residue)
64148 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
64149 +/*
64150 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
64151 + * softirq context via motor_off_callback. A generic bug we happen to trigger.
64152 + */
64153 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
64154 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
64155 +
64156 +static int virtual_dma_count;
64157 +static int virtual_dma_residue;
64158 +static char *virtual_dma_addr;
64159 +static int virtual_dma_mode;
64160 +static int doing_pdma;
64161 +
64162 +static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
64163 +{
64164 +       register unsigned char st;
64165 +       register int lcount;
64166 +       register char *lptr;
64167 +
64168 +       if (!doing_pdma)
64169 +               return floppy_interrupt(irq, dev_id, regs);
64170 +
64171 +       st = 1;
64172 +       for(lcount=virtual_dma_count, lptr=virtual_dma_addr; 
64173 +           lcount; lcount--, lptr++) {
64174 +               st=inb(virtual_dma_port+4) & 0xa0 ;
64175 +               if(st != 0xa0) 
64176 +                       break;
64177 +               if(virtual_dma_mode)
64178 +                       outb_p(*lptr, virtual_dma_port+5);
64179 +               else
64180 +                       *lptr = inb_p(virtual_dma_port+5);
64181 +       }
64182 +       virtual_dma_count = lcount;
64183 +       virtual_dma_addr = lptr;
64184 +       st = inb(virtual_dma_port+4);
64185 +
64186 +       if(st == 0x20)
64187 +               return IRQ_HANDLED;
64188 +       if(!(st & 0x20)) {
64189 +               virtual_dma_residue += virtual_dma_count;
64190 +               virtual_dma_count=0;
64191 +               doing_pdma = 0;
64192 +               floppy_interrupt(irq, dev_id, regs);
64193 +               return IRQ_HANDLED;
64194 +       }
64195 +       return IRQ_HANDLED;
64196 +}
64197 +
64198 +static void fd_disable_dma(void)
64199 +{
64200 +       doing_pdma = 0;
64201 +       virtual_dma_residue += virtual_dma_count;
64202 +       virtual_dma_count=0;
64203 +}
64204 +
64205 +static int fd_request_irq(void)
64206 +{
64207 +       return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
64208 +                                          "floppy", NULL);
64209 +}
64210 +
64211 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
64212 +{
64213 +       doing_pdma = 1;
64214 +       virtual_dma_port = io;
64215 +       virtual_dma_mode = (mode  == DMA_MODE_WRITE);
64216 +       virtual_dma_addr = addr;
64217 +       virtual_dma_count = size;
64218 +       virtual_dma_residue = 0;
64219 +       return 0;
64220 +}
64221 +
64222 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
64223 +#define FDC1 xen_floppy_init()
64224 +static int FDC2 = -1;
64225 +
64226 +static int xen_floppy_init(void)
64227 +{
64228 +       use_virtual_dma = 1;
64229 +       can_use_virtual_dma = 1;
64230 +       return 0x3f0;
64231 +}
64232 +
64233 +/*
64234 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
64235 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
64236 + * coincides with another rtc CMOS user.               Paul G.
64237 + */
64238 +#define FLOPPY0_TYPE   ({                              \
64239 +       unsigned long flags;                            \
64240 +       unsigned char val;                              \
64241 +       spin_lock_irqsave(&rtc_lock, flags);            \
64242 +       val = (CMOS_READ(0x10) >> 4) & 15;              \
64243 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
64244 +       val;                                            \
64245 +})
64246 +
64247 +#define FLOPPY1_TYPE   ({                              \
64248 +       unsigned long flags;                            \
64249 +       unsigned char val;                              \
64250 +       spin_lock_irqsave(&rtc_lock, flags);            \
64251 +       val = CMOS_READ(0x10) & 15;                     \
64252 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
64253 +       val;                                            \
64254 +})
64255 +
64256 +#define N_FDC 2
64257 +#define N_DRIVE 8
64258 +
64259 +#define FLOPPY_MOTOR_MASK 0xf0
64260 +
64261 +#define EXTRA_FLOPPY_PARAMS
64262 +
64263 +#endif /* __ASM_XEN_I386_FLOPPY_H */
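Not part of the patch: the header above pins MAX_DMA_ADDRESS to 0 and routes fd_dma_setup() through vdma_dma_setup(), so floppy transfers never touch the ISA DMA controller; instead floppy_hardint() shovels bytes through the FDC FIFO for as long as the status port reads 0xa0. A rough userspace sketch of that polling loop, with the port I/O replaced by a mock FIFO:

#include <stdio.h>

/* Mock FIFO standing in for inb_p(virtual_dma_port+5) in the real loop. */
static unsigned char fifo[8] = "XENFDD!";
static unsigned int  fifo_pos;

static unsigned char mock_status(void)
{
        /* 0xa0 (RQM|DIO set) means "controller has a byte for us". */
        return fifo_pos < sizeof(fifo) ? 0xa0 : 0x20;
}

int main(void)
{
        unsigned char buf[8];
        unsigned char *p = buf;
        int count = sizeof(buf);

        /* Same shape as floppy_hardint(): move data while status says go. */
        while (count && mock_status() == 0xa0) {
                *p++ = fifo[fifo_pos++];
                count--;
        }
        printf("transferred %d bytes: %.8s\n", (int)(p - buf), (char *)buf);
        return 0;
}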
64264 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/highmem.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/highmem.h
64265 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/highmem.h  1970-01-01 01:00:00.000000000 +0100
64266 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/highmem.h     2006-04-10 00:05:52.000000000 +0200
64267 @@ -0,0 +1,81 @@
64268 +/*
64269 + * highmem.h: virtual kernel memory mappings for high memory
64270 + *
64271 + * Used in CONFIG_HIGHMEM systems for memory pages which
64272 + * are not addressable by direct kernel virtual addresses.
64273 + *
64274 + * Copyright (C) 1999 Gerhard Wichert, Siemens AG
64275 + *                   Gerhard.Wichert@pdb.siemens.de
64276 + *
64277 + *
64278 + * Redesigned the x86 32-bit VM architecture to deal with 
64279 + * up to 16 Terabyte physical memory. With current x86 CPUs
64280 + * we now support up to 64 Gigabytes physical RAM.
64281 + *
64282 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
64283 + */
64284 +
64285 +#ifndef _ASM_HIGHMEM_H
64286 +#define _ASM_HIGHMEM_H
64287 +
64288 +#ifdef __KERNEL__
64289 +
64290 +#include <linux/config.h>
64291 +#include <linux/interrupt.h>
64292 +#include <linux/threads.h>
64293 +#include <asm/kmap_types.h>
64294 +#include <asm/tlbflush.h>
64295 +
64296 +/* declarations for highmem.c */
64297 +extern unsigned long highstart_pfn, highend_pfn;
64298 +
64299 +extern pte_t *kmap_pte;
64300 +extern pgprot_t kmap_prot;
64301 +extern pte_t *pkmap_page_table;
64302 +
64303 +/*
64304 + * Right now we initialize only a single pte table. It can be extended
64305 + * easily, subsequent pte tables have to be allocated in one physical
64306 + * chunk of RAM.
64307 + */
64308 +#ifdef CONFIG_X86_PAE
64309 +#define LAST_PKMAP 512
64310 +#else
64311 +#define LAST_PKMAP 1024
64312 +#endif
64313 +/*
64314 + * Ordering is:
64315 + *
64316 + * FIXADDR_TOP
64317 + *                     fixed_addresses
64318 + * FIXADDR_START
64319 + *                     temp fixed addresses
64320 + * FIXADDR_BOOT_START
64321 + *                     Persistent kmap area
64322 + * PKMAP_BASE
64323 + * VMALLOC_END
64324 + *                     Vmalloc area
64325 + * VMALLOC_START
64326 + * high_memory
64327 + */
64328 +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
64329 +#define LAST_PKMAP_MASK (LAST_PKMAP-1)
64330 +#define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
64331 +#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
64332 +
64333 +extern void * FASTCALL(kmap_high(struct page *page));
64334 +extern void FASTCALL(kunmap_high(struct page *page));
64335 +
64336 +void *kmap(struct page *page);
64337 +void kunmap(struct page *page);
64338 +void *kmap_atomic(struct page *page, enum km_type type);
64339 +void *kmap_atomic_pte(struct page *page, enum km_type type);
64340 +void kunmap_atomic(void *kvaddr, enum km_type type);
64341 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
64342 +struct page *kmap_atomic_to_page(void *ptr);
64343 +
64344 +#define flush_cache_kmaps()    do { } while (0)
64345 +
64346 +#endif /* __KERNEL__ */
64347 +
64348 +#endif /* _ASM_HIGHMEM_H */
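Not part of the patch: a quick userspace check of the PKMAP_NR()/PKMAP_ADDR() round trip above, using a mock PKMAP_BASE (the kernel derives the real one from FIXADDR_BOOT_START) and the non-PAE LAST_PKMAP:

#include <stdio.h>

#define PAGE_SHIFT  12
#define PKMAP_BASE  0xff800000UL    /* mock; see the PMD_MASK expression above */
#define LAST_PKMAP  1024

#define PKMAP_NR(virt)  (((virt) - PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))

int main(void)
{
        unsigned long va = PKMAP_ADDR(37);
        printf("slot 37 -> va 0x%lx -> slot %lu (of %d)\n",
               va, PKMAP_NR(va), LAST_PKMAP);
        return 0;
}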
64349 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/hw_irq.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/hw_irq.h
64350 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/hw_irq.h   1970-01-01 01:00:00.000000000 +0100
64351 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/hw_irq.h      2006-04-10 00:05:52.000000000 +0200
64352 @@ -0,0 +1,73 @@
64353 +#ifndef _ASM_HW_IRQ_H
64354 +#define _ASM_HW_IRQ_H
64355 +
64356 +/*
64357 + *     linux/include/asm/hw_irq.h
64358 + *
64359 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
64360 + *
64361 + *     moved some of the old arch/i386/kernel/irq.h to here. VY
64362 + *
64363 + *     IRQ/IPI changes taken from work by Thomas Radke
64364 + *     <tomsoft@informatik.tu-chemnitz.de>
64365 + */
64366 +
64367 +#include <linux/config.h>
64368 +#include <linux/profile.h>
64369 +#include <asm/atomic.h>
64370 +#include <asm/irq.h>
64371 +#include <asm/sections.h>
64372 +
64373 +struct hw_interrupt_type;
64374 +
64375 +/*
64376 + * Various low-level irq details needed by irq.c, process.c,
64377 + * time.c, io_apic.c and smp.c
64378 + *
64379 + * Interrupt entry/exit code at both C and assembly level
64380 + */
64381 +
64382 +extern u8 irq_vector[NR_IRQ_VECTORS];
64383 +#define IO_APIC_VECTOR(irq)    (irq_vector[irq])
64384 +#define AUTO_ASSIGN            -1
64385 +
64386 +extern void (*interrupt[NR_IRQS])(void);
64387 +
64388 +#ifdef CONFIG_SMP
64389 +fastcall void reschedule_interrupt(void);
64390 +fastcall void invalidate_interrupt(void);
64391 +fastcall void call_function_interrupt(void);
64392 +#endif
64393 +
64394 +#ifdef CONFIG_X86_LOCAL_APIC
64395 +fastcall void apic_timer_interrupt(void);
64396 +fastcall void error_interrupt(void);
64397 +fastcall void spurious_interrupt(void);
64398 +fastcall void thermal_interrupt(struct pt_regs *);
64399 +#define platform_legacy_irq(irq)       ((irq) < 16)
64400 +#endif
64401 +
64402 +void disable_8259A_irq(unsigned int irq);
64403 +void enable_8259A_irq(unsigned int irq);
64404 +int i8259A_irq_pending(unsigned int irq);
64405 +void make_8259A_irq(unsigned int irq);
64406 +void init_8259A(int aeoi);
64407 +void FASTCALL(send_IPI_self(int vector));
64408 +void init_VISWS_APIC_irqs(void);
64409 +void setup_IO_APIC(void);
64410 +void disable_IO_APIC(void);
64411 +void print_IO_APIC(void);
64412 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
64413 +void send_IPI(int dest, int vector);
64414 +void setup_ioapic_dest(void);
64415 +
64416 +extern unsigned long io_apic_irqs;
64417 +
64418 +extern atomic_t irq_err_count;
64419 +extern atomic_t irq_mis_count;
64420 +
64421 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
64422 +
64423 +extern void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i);
64424 +
64425 +#endif /* _ASM_HW_IRQ_H */
64426 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/hypercall.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/hypercall.h
64427 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/hypercall.h        1970-01-01 01:00:00.000000000 +0100
64428 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/hypercall.h   2006-04-10 00:05:52.000000000 +0200
64429 @@ -0,0 +1,342 @@
64430 +/******************************************************************************
64431 + * hypercall.h
64432 + * 
64433 + * Linux-specific hypervisor handling.
64434 + * 
64435 + * Copyright (c) 2002-2004, K A Fraser
64436 + * 
64437 + * This program is free software; you can redistribute it and/or
64438 + * modify it under the terms of the GNU General Public License version 2
64439 + * as published by the Free Software Foundation; or, when distributed
64440 + * separately from the Linux kernel or incorporated into other
64441 + * software packages, subject to the following license:
64442 + * 
64443 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64444 + * of this source file (the "Software"), to deal in the Software without
64445 + * restriction, including without limitation the rights to use, copy, modify,
64446 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64447 + * and to permit persons to whom the Software is furnished to do so, subject to
64448 + * the following conditions:
64449 + * 
64450 + * The above copyright notice and this permission notice shall be included in
64451 + * all copies or substantial portions of the Software.
64452 + * 
64453 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64454 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64455 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64456 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64457 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64458 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64459 + * IN THE SOFTWARE.
64460 + */
64461 +
64462 +#ifndef __HYPERCALL_H__
64463 +#define __HYPERCALL_H__
64464 +
64465 +#ifndef __HYPERVISOR_H__
64466 +# error "please don't include this file directly"
64467 +#endif
64468 +
64469 +#define __STR(x) #x
64470 +#define STR(x) __STR(x)
64471 +
64472 +#define _hypercall0(type, name)                        \
64473 +({                                             \
64474 +       long __res;                             \
64475 +       asm volatile (                          \
64476 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64477 +               : "=a" (__res)                  \
64478 +               :                               \
64479 +               : "memory" );                   \
64480 +       (type)__res;                            \
64481 +})
64482 +
64483 +#define _hypercall1(type, name, a1)                            \
64484 +({                                                             \
64485 +       long __res, __ign1;                                     \
64486 +       asm volatile (                                          \
64487 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64488 +               : "=a" (__res), "=b" (__ign1)                   \
64489 +               : "1" ((long)(a1))                              \
64490 +               : "memory" );                                   \
64491 +       (type)__res;                                            \
64492 +})
64493 +
64494 +#define _hypercall2(type, name, a1, a2)                                \
64495 +({                                                             \
64496 +       long __res, __ign1, __ign2;                             \
64497 +       asm volatile (                                          \
64498 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64499 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2)    \
64500 +               : "1" ((long)(a1)), "2" ((long)(a2))            \
64501 +               : "memory" );                                   \
64502 +       (type)__res;                                            \
64503 +})
64504 +
64505 +#define _hypercall3(type, name, a1, a2, a3)                    \
64506 +({                                                             \
64507 +       long __res, __ign1, __ign2, __ign3;                     \
64508 +       asm volatile (                                          \
64509 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64510 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
64511 +               "=d" (__ign3)                                   \
64512 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
64513 +               "3" ((long)(a3))                                \
64514 +               : "memory" );                                   \
64515 +       (type)__res;                                            \
64516 +})
64517 +
64518 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
64519 +({                                                             \
64520 +       long __res, __ign1, __ign2, __ign3, __ign4;             \
64521 +       asm volatile (                                          \
64522 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64523 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
64524 +               "=d" (__ign3), "=S" (__ign4)                    \
64525 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
64526 +               "3" ((long)(a3)), "4" ((long)(a4))              \
64527 +               : "memory" );                                   \
64528 +       (type)__res;                                            \
64529 +})
64530 +
64531 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
64532 +({                                                             \
64533 +       long __res, __ign1, __ign2, __ign3, __ign4, __ign5;     \
64534 +       asm volatile (                                          \
64535 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64536 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
64537 +               "=d" (__ign3), "=S" (__ign4), "=D" (__ign5)     \
64538 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
64539 +               "3" ((long)(a3)), "4" ((long)(a4)),             \
64540 +               "5" ((long)(a5))                                \
64541 +               : "memory" );                                   \
64542 +       (type)__res;                                            \
64543 +})
64544 +
64545 +static inline int
64546 +HYPERVISOR_set_trap_table(
64547 +       trap_info_t *table)
64548 +{
64549 +       return _hypercall1(int, set_trap_table, table);
64550 +}
64551 +
64552 +static inline int
64553 +HYPERVISOR_mmu_update(
64554 +       mmu_update_t *req, int count, int *success_count, domid_t domid)
64555 +{
64556 +       return _hypercall4(int, mmu_update, req, count, success_count, domid);
64557 +}
64558 +
64559 +static inline int
64560 +HYPERVISOR_mmuext_op(
64561 +       struct mmuext_op *op, int count, int *success_count, domid_t domid)
64562 +{
64563 +       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
64564 +}
64565 +
64566 +static inline int
64567 +HYPERVISOR_set_gdt(
64568 +       unsigned long *frame_list, int entries)
64569 +{
64570 +       return _hypercall2(int, set_gdt, frame_list, entries);
64571 +}
64572 +
64573 +static inline int
64574 +HYPERVISOR_stack_switch(
64575 +       unsigned long ss, unsigned long esp)
64576 +{
64577 +       return _hypercall2(int, stack_switch, ss, esp);
64578 +}
64579 +
64580 +static inline int
64581 +HYPERVISOR_set_callbacks(
64582 +       unsigned long event_selector, unsigned long event_address,
64583 +       unsigned long failsafe_selector, unsigned long failsafe_address)
64584 +{
64585 +       return _hypercall4(int, set_callbacks,
64586 +                          event_selector, event_address,
64587 +                          failsafe_selector, failsafe_address);
64588 +}
64589 +
64590 +static inline int
64591 +HYPERVISOR_fpu_taskswitch(
64592 +       int set)
64593 +{
64594 +       return _hypercall1(int, fpu_taskswitch, set);
64595 +}
64596 +
64597 +static inline int
64598 +HYPERVISOR_sched_op_compat(
64599 +       int cmd, unsigned long arg)
64600 +{
64601 +       return _hypercall2(int, sched_op_compat, cmd, arg);
64602 +}
64603 +
64604 +static inline int
64605 +HYPERVISOR_sched_op(
64606 +       int cmd, void *arg)
64607 +{
64608 +       return _hypercall2(int, sched_op, cmd, arg);
64609 +}
64610 +
64611 +static inline long
64612 +HYPERVISOR_set_timer_op(
64613 +       u64 timeout)
64614 +{
64615 +       unsigned long timeout_hi = (unsigned long)(timeout>>32);
64616 +       unsigned long timeout_lo = (unsigned long)timeout;
64617 +       return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
64618 +}
64619 +
64620 +static inline int
64621 +HYPERVISOR_dom0_op(
64622 +       dom0_op_t *dom0_op)
64623 +{
64624 +       dom0_op->interface_version = DOM0_INTERFACE_VERSION;
64625 +       return _hypercall1(int, dom0_op, dom0_op);
64626 +}
64627 +
64628 +static inline int
64629 +HYPERVISOR_set_debugreg(
64630 +       int reg, unsigned long value)
64631 +{
64632 +       return _hypercall2(int, set_debugreg, reg, value);
64633 +}
64634 +
64635 +static inline unsigned long
64636 +HYPERVISOR_get_debugreg(
64637 +       int reg)
64638 +{
64639 +       return _hypercall1(unsigned long, get_debugreg, reg);
64640 +}
64641 +
64642 +static inline int
64643 +HYPERVISOR_update_descriptor(
64644 +       u64 ma, u64 desc)
64645 +{
64646 +       return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
64647 +}
64648 +
64649 +static inline int
64650 +HYPERVISOR_memory_op(
64651 +       unsigned int cmd, void *arg)
64652 +{
64653 +       return _hypercall2(int, memory_op, cmd, arg);
64654 +}
64655 +
64656 +static inline int
64657 +HYPERVISOR_multicall(
64658 +       void *call_list, int nr_calls)
64659 +{
64660 +       return _hypercall2(int, multicall, call_list, nr_calls);
64661 +}
64662 +
64663 +static inline int
64664 +HYPERVISOR_update_va_mapping(
64665 +       unsigned long va, pte_t new_val, unsigned long flags)
64666 +{
64667 +       unsigned long pte_hi = 0;
64668 +#ifdef CONFIG_X86_PAE
64669 +       pte_hi = new_val.pte_high;
64670 +#endif
64671 +       return _hypercall4(int, update_va_mapping, va,
64672 +                          new_val.pte_low, pte_hi, flags);
64673 +}
64674 +
64675 +static inline int
64676 +HYPERVISOR_event_channel_op(
64677 +       void *op)
64678 +{
64679 +       return _hypercall1(int, event_channel_op, op);
64680 +}
64681 +
64682 +static inline int
64683 +HYPERVISOR_xen_version(
64684 +       int cmd, void *arg)
64685 +{
64686 +       return _hypercall2(int, xen_version, cmd, arg);
64687 +}
64688 +
64689 +static inline int
64690 +HYPERVISOR_console_io(
64691 +       int cmd, int count, char *str)
64692 +{
64693 +       return _hypercall3(int, console_io, cmd, count, str);
64694 +}
64695 +
64696 +static inline int
64697 +HYPERVISOR_physdev_op(
64698 +       void *physdev_op)
64699 +{
64700 +       return _hypercall1(int, physdev_op, physdev_op);
64701 +}
64702 +
64703 +static inline int
64704 +HYPERVISOR_grant_table_op(
64705 +       unsigned int cmd, void *uop, unsigned int count)
64706 +{
64707 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
64708 +}
64709 +
64710 +static inline int
64711 +HYPERVISOR_update_va_mapping_otherdomain(
64712 +       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
64713 +{
64714 +       unsigned long pte_hi = 0;
64715 +#ifdef CONFIG_X86_PAE
64716 +       pte_hi = new_val.pte_high;
64717 +#endif
64718 +       return _hypercall5(int, update_va_mapping_otherdomain, va,
64719 +                          new_val.pte_low, pte_hi, flags, domid);
64720 +}
64721 +
64722 +static inline int
64723 +HYPERVISOR_vm_assist(
64724 +       unsigned int cmd, unsigned int type)
64725 +{
64726 +       return _hypercall2(int, vm_assist, cmd, type);
64727 +}
64728 +
64729 +static inline int
64730 +HYPERVISOR_vcpu_op(
64731 +       int cmd, int vcpuid, void *extra_args)
64732 +{
64733 +       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
64734 +}
64735 +
64736 +static inline int
64737 +HYPERVISOR_suspend(
64738 +       unsigned long srec)
64739 +{
64740 +       struct sched_shutdown sched_shutdown = {
64741 +               .reason = SHUTDOWN_suspend
64742 +       };
64743 +
64744 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
64745 +                            &sched_shutdown, srec);
64746 +
64747 +       if (rc == -ENOSYS)
64748 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
64749 +                                SHUTDOWN_suspend, srec);
64750 +
64751 +       return rc;
64752 +}
64753 +
64754 +static inline int
64755 +HYPERVISOR_nmi_op(
64756 +       unsigned long op, void *arg)
64757 +{
64758 +       return _hypercall2(int, nmi_op, op, arg);
64759 +}
64760 +
64761 +#endif /* __HYPERCALL_H__ */
64762 +
64763 +/*
64764 + * Local variables:
64765 + *  c-file-style: "linux"
64766 + *  indent-tabs-mode: t
64767 + *  c-indent-level: 8
64768 + *  c-basic-offset: 8
64769 + *  tab-width: 8
64770 + * End:
64771 + */
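Not part of the patch: the _hypercallN macros above pass arguments purely through asm register constraints ("=a" for the return, "=b"/"=c"/... paired with "1"/"2"/... inputs) and jump to a fixed 32-byte slot in hypercall_page. A standalone sketch of the same constraint plumbing; since hypercall_page exists only inside a Xen guest, the call is replaced here by a harmless addl so the example links and runs anywhere (i386/x86-64, gcc or clang):

#include <stdio.h>

/* Same register binding idea as _hypercall2(), with "addl" standing in for
 * "call hypercall_page + (NR * 32)". */
static long mock_hypercall2(long a1, long a2)
{
        long res, ign1;
        asm volatile (
                "addl %%ebx, %%eax"
                : "=a" (res), "=b" (ign1)   /* result comes back in EAX */
                : "0" (a1), "1" (a2)        /* a1 pinned to EAX, a2 to EBX */
                : "memory");
        (void)ign1;
        return res;
}

int main(void)
{
        printf("mock hypercall returned %ld\n", mock_hypercall2(40, 2));
        return 0;
}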
64772 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/hypervisor.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/hypervisor.h
64773 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/hypervisor.h       1970-01-01 01:00:00.000000000 +0100
64774 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/hypervisor.h  2006-04-10 00:05:52.000000000 +0200
64775 @@ -0,0 +1,224 @@
64776 +/******************************************************************************
64777 + * hypervisor.h
64778 + * 
64779 + * Linux-specific hypervisor handling.
64780 + * 
64781 + * Copyright (c) 2002-2004, K A Fraser
64782 + * 
64783 + * This program is free software; you can redistribute it and/or
64784 + * modify it under the terms of the GNU General Public License version 2
64785 + * as published by the Free Software Foundation; or, when distributed
64786 + * separately from the Linux kernel or incorporated into other
64787 + * software packages, subject to the following license:
64788 + * 
64789 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64790 + * of this source file (the "Software"), to deal in the Software without
64791 + * restriction, including without limitation the rights to use, copy, modify,
64792 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64793 + * and to permit persons to whom the Software is furnished to do so, subject to
64794 + * the following conditions:
64795 + * 
64796 + * The above copyright notice and this permission notice shall be included in
64797 + * all copies or substantial portions of the Software.
64798 + * 
64799 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64800 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64801 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64802 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64803 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64804 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64805 + * IN THE SOFTWARE.
64806 + */
64807 +
64808 +#ifndef __HYPERVISOR_H__
64809 +#define __HYPERVISOR_H__
64810 +
64811 +#include <linux/config.h>
64812 +#include <linux/types.h>
64813 +#include <linux/kernel.h>
64814 +#include <linux/version.h>
64815 +#include <linux/errno.h>
64816 +#include <xen/interface/xen.h>
64817 +#include <xen/interface/dom0_ops.h>
64818 +#include <xen/interface/sched.h>
64819 +#include <xen/interface/nmi.h>
64820 +#include <asm/ptrace.h>
64821 +#include <asm/page.h>
64822 +#if defined(__i386__)
64823 +#  ifdef CONFIG_X86_PAE
64824 +#   include <asm-generic/pgtable-nopud.h>
64825 +#  else
64826 +#   include <asm-generic/pgtable-nopmd.h>
64827 +#  endif
64828 +#endif
64829 +
64830 +extern shared_info_t *HYPERVISOR_shared_info;
64831 +
64832 +/* arch/xen/i386/kernel/setup.c */
64833 +extern start_info_t *xen_start_info;
64834 +
64835 +/* arch/xen/kernel/evtchn.c */
64836 +/* Force a proper event-channel callback from Xen. */
64837 +void force_evtchn_callback(void);
64838 +
64839 +/* arch/xen/kernel/process.c */
64840 +void xen_cpu_idle (void);
64841 +
64842 +/* arch/xen/i386/kernel/hypervisor.c */
64843 +void do_hypervisor_callback(struct pt_regs *regs);
64844 +
64845 +/* arch/xen/i386/mm/hypervisor.c */
64846 +/*
64847 + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
64848 + * be MACHINE addresses.
64849 + */
64850 +
64851 +void xen_pt_switch(unsigned long ptr);
64852 +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
64853 +void xen_load_gs(unsigned int selector); /* x86_64 only */
64854 +void xen_tlb_flush(void);
64855 +void xen_invlpg(unsigned long ptr);
64856 +
64857 +void xen_l1_entry_update(pte_t *ptr, pte_t val);
64858 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
64859 +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
64860 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
64861 +void xen_pgd_pin(unsigned long ptr);
64862 +void xen_pgd_unpin(unsigned long ptr);
64863 +
64864 +void xen_set_ldt(unsigned long ptr, unsigned long bytes);
64865 +void xen_machphys_update(unsigned long mfn, unsigned long pfn);
64866 +
64867 +#ifdef CONFIG_SMP
64868 +#include <linux/cpumask.h>
64869 +void xen_tlb_flush_all(void);
64870 +void xen_invlpg_all(unsigned long ptr);
64871 +void xen_tlb_flush_mask(cpumask_t *mask);
64872 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
64873 +#endif
64874 +
64875 +/* Returns zero on success else negative errno. */
64876 +int xen_create_contiguous_region(
64877 +    unsigned long vstart, unsigned int order, unsigned int address_bits);
64878 +void xen_destroy_contiguous_region(
64879 +    unsigned long vstart, unsigned int order);
64880 +
64881 +/* Turn jiffies into Xen system time. */
64882 +u64 jiffies_to_st(unsigned long jiffies);
64883 +
64884 +#include <asm/hypercall.h>
64885 +
64886 +#if defined(CONFIG_X86_64)
64887 +#define MULTI_UVMFLAGS_INDEX 2
64888 +#define MULTI_UVMDOMID_INDEX 3
64889 +#else
64890 +#define MULTI_UVMFLAGS_INDEX 3
64891 +#define MULTI_UVMDOMID_INDEX 4
64892 +#endif
64893 +
64894 +#define xen_init()     (0)
64895 +
64896 +static inline int
64897 +HYPERVISOR_yield(
64898 +       void)
64899 +{
64900 +       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
64901 +
64902 +       if (rc == -ENOSYS)
64903 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
64904 +
64905 +       return rc;
64906 +}
64907 +
64908 +static inline int
64909 +HYPERVISOR_block(
64910 +       void)
64911 +{
64912 +       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
64913 +
64914 +       if (rc == -ENOSYS)
64915 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
64916 +
64917 +       return rc;
64918 +}
64919 +
64920 +static inline int
64921 +HYPERVISOR_shutdown(
64922 +       unsigned int reason)
64923 +{
64924 +       struct sched_shutdown sched_shutdown = {
64925 +               .reason = reason
64926 +       };
64927 +
64928 +       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
64929 +
64930 +       if (rc == -ENOSYS)
64931 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
64932 +
64933 +       return rc;
64934 +}
64935 +
64936 +static inline int
64937 +HYPERVISOR_poll(
64938 +       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
64939 +{
64940 +       struct sched_poll sched_poll = {
64941 +               .ports = ports,
64942 +               .nr_ports = nr_ports,
64943 +               .timeout = jiffies_to_st(timeout)
64944 +       };
64945 +
64946 +       int rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
64947 +
64948 +       if (rc == -ENOSYS)
64949 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
64950 +
64951 +       return rc;
64952 +}
64953 +
64954 +static inline void
64955 +MULTI_update_va_mapping(
64956 +    multicall_entry_t *mcl, unsigned long va,
64957 +    pte_t new_val, unsigned long flags)
64958 +{
64959 +    mcl->op = __HYPERVISOR_update_va_mapping;
64960 +    mcl->args[0] = va;
64961 +#if defined(CONFIG_X86_64)
64962 +    mcl->args[1] = new_val.pte;
64963 +    mcl->args[2] = flags;
64964 +#elif defined(CONFIG_X86_PAE)
64965 +    mcl->args[1] = new_val.pte_low;
64966 +    mcl->args[2] = new_val.pte_high;
64967 +    mcl->args[3] = flags;
64968 +#else
64969 +    mcl->args[1] = new_val.pte_low;
64970 +    mcl->args[2] = 0;
64971 +    mcl->args[3] = flags;
64972 +#endif
64973 +}
64974 +
64975 +static inline void
64976 +MULTI_update_va_mapping_otherdomain(
64977 +    multicall_entry_t *mcl, unsigned long va,
64978 +    pte_t new_val, unsigned long flags, domid_t domid)
64979 +{
64980 +    mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
64981 +    mcl->args[0] = va;
64982 +#if defined(CONFIG_X86_64)
64983 +    mcl->args[1] = new_val.pte;
64984 +    mcl->args[2] = flags;
64985 +    mcl->args[3] = domid;
64986 +#elif defined(CONFIG_X86_PAE)
64987 +    mcl->args[1] = new_val.pte_low;
64988 +    mcl->args[2] = new_val.pte_high;
64989 +    mcl->args[3] = flags;
64990 +    mcl->args[4] = domid;
64991 +#else
64992 +    mcl->args[1] = new_val.pte_low;
64993 +    mcl->args[2] = 0;
64994 +    mcl->args[3] = flags;
64995 +    mcl->args[4] = domid;
64996 +#endif
64997 +}
64998 +
64999 +#endif /* __HYPERVISOR_H__ */
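Not part of the patch: MULTI_update_va_mapping() above only fills in one multicall_entry_t, so a caller can queue several updates and submit them with a single HYPERVISOR_multicall(). A minimal sketch of that batching pattern with a mock entry type (the real one comes from xen/interface/xen.h), using the non-PAE i386 argument layout:

#include <stdio.h>

struct mock_mcl { unsigned long op; unsigned long args[5]; };

#define MOCK_UPDATE_VA_MAPPING 1UL

static void mock_fill(struct mock_mcl *mcl, unsigned long va,
                      unsigned long pte_low, unsigned long flags)
{
        mcl->op      = MOCK_UPDATE_VA_MAPPING;
        mcl->args[0] = va;
        mcl->args[1] = pte_low;
        mcl->args[2] = 0;          /* pte_high is 0 without PAE */
        mcl->args[3] = flags;      /* MULTI_UVMFLAGS_INDEX == 3 on i386 */
}

int main(void)
{
        struct mock_mcl batch[2];

        mock_fill(&batch[0], 0xb0000000UL, 0x183UL, 0);
        mock_fill(&batch[1], 0xb0001000UL, 0x283UL, 0);
        /* A real guest would now issue HYPERVISOR_multicall(batch, 2). */
        for (int i = 0; i < 2; i++)
                printf("entry %d: op=%lu va=0x%lx\n",
                       i, batch[i].op, batch[i].args[0]);
        return 0;
}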
65000 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/io.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/io.h
65001 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/io.h       1970-01-01 01:00:00.000000000 +0100
65002 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/io.h  2006-04-10 00:05:52.000000000 +0200
65003 @@ -0,0 +1,401 @@
65004 +#ifndef _ASM_IO_H
65005 +#define _ASM_IO_H
65006 +
65007 +#include <linux/config.h>
65008 +#include <linux/string.h>
65009 +#include <linux/compiler.h>
65010 +
65011 +/*
65012 + * This file contains the definitions for the x86 IO instructions
65013 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
65014 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
65015 + * versions of the single-IO instructions (inb_p/inw_p/..).
65016 + *
65017 + * This file is not meant to be obfuscating: it's just complicated
65018 + * to (a) handle it all in a way that makes gcc able to optimize it
65019 + * as well as possible and (b) avoid writing the same thing
65020 + * over and over again with slight variations and possibly making a
65021 + * mistake somewhere.
65022 + */
65023 +
65024 +/*
65025 + * Thanks to James van Artsdalen for a better timing-fix than
65026 + * the two short jumps: using outb's to a nonexistent port seems
65027 + * to guarantee better timings even on fast machines.
65028 + *
65029 + * On the other hand, I'd like to be sure of a non-existent port:
65030 + * I feel a bit unsafe about using 0x80 (should be safe, though)
65031 + *
65032 + *             Linus
65033 + */
65034 +
65035 + /*
65036 +  *  Bit simplified and optimized by Jan Hubicka
65037 +  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
65038 +  *
65039 +  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
65040 +  *  isa_read[wl] and isa_write[wl] fixed
65041 +  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
65042 +  */
65043 +
65044 +#define IO_SPACE_LIMIT 0xffff
65045 +
65046 +#define XQUAD_PORTIO_BASE 0xfe400000
65047 +#define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
65048 +
65049 +#ifdef __KERNEL__
65050 +
65051 +#include <asm-generic/iomap.h>
65052 +
65053 +#include <linux/vmalloc.h>
65054 +#include <asm/fixmap.h>
65055 +
65056 +/*
65057 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
65058 + * access
65059 + */
65060 +#define xlate_dev_mem_ptr(p)   __va(p)
65061 +
65062 +/*
65063 + * Convert a virtual cached pointer to an uncached pointer
65064 + */
65065 +#define xlate_dev_kmem_ptr(p)  p
65066 +
65067 +/**
65068 + *     virt_to_phys    -       map virtual addresses to physical
65069 + *     @address: address to remap
65070 + *
65071 + *     The returned physical address is the physical (CPU) mapping for
65072 + *     the memory address given. It is only valid to use this function on
65073 + *     addresses directly mapped or allocated via kmalloc. 
65074 + *
65075 + *     This function does not give bus mappings for DMA transfers. In
65076 + *     almost all conceivable cases a device driver should not be using
65077 + *     this function
65078 + */
65079 +
65080 +static inline unsigned long virt_to_phys(volatile void * address)
65081 +{
65082 +       return __pa(address);
65083 +}
65084 +
65085 +/**
65086 + *     phys_to_virt    -       map physical address to virtual
65087 + *     @address: address to remap
65088 + *
65089 + *     The returned virtual address is a current CPU mapping for
65090 + *     the memory address given. It is only valid to use this function on
65091 + *     addresses that have a kernel mapping
65092 + *
65093 + *     This function does not handle bus mappings for DMA transfers. In
65094 + *     almost all conceivable cases a device driver should not be using
65095 + *     this function
65096 + */
65097 +
65098 +static inline void * phys_to_virt(unsigned long address)
65099 +{
65100 +       return __va(address);
65101 +}
65102 +
65103 +/*
65104 + * Change "struct page" to physical address.
65105 + */
65106 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
65107 +#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
65108 +
65109 +#define bio_to_pseudophys(bio)  (page_to_pseudophys(bio_page((bio))) + \
65110 +                                 (unsigned long) bio_offset((bio)))
65111 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
65112 +                                 (unsigned long) (bv)->bv_offset)
65113 +
65114 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)      \
65115 +       (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
65116 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
65117 +         bvec_to_pseudophys((vec2))))
65118 +
65119 +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
65120 +
65121 +/**
65122 + * ioremap     -   map bus memory into CPU space
65123 + * @offset:    bus address of the memory
65124 + * @size:      size of the resource to map
65125 + *
65126 + * ioremap performs a platform specific sequence of operations to
65127 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
65128 + * writew/writel functions and the other mmio helpers. The returned
65129 + * address is not guaranteed to be usable directly as a virtual
65130 + * address. 
65131 + */
65132 +
65133 +static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
65134 +{
65135 +       return __ioremap(offset, size, 0);
65136 +}
65137 +
65138 +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
65139 +extern void iounmap(volatile void __iomem *addr);
65140 +
65141 +/*
65142 + * bt_ioremap() and bt_iounmap() are for temporary early boot-time
65143 + * mappings, before the real ioremap() is functional.
65144 + * A boot-time mapping is currently limited to at most 16 pages.
65145 + */
65146 +extern void *bt_ioremap(unsigned long offset, unsigned long size);
65147 +extern void bt_iounmap(void *addr, unsigned long size);
65148 +
65149 +/* Use early IO mappings for DMI because it's initialized early */
65150 +#define dmi_ioremap bt_ioremap
65151 +#define dmi_iounmap bt_iounmap
65152 +#define dmi_alloc alloc_bootmem
65153 +
65154 +/*
65155 + * ISA I/O bus memory addresses are 1:1 with the physical address.
65156 + */
65157 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
65158 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
65159 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
65160 +
65161 +/*
65162 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
65163 + * are forbidden in portable PCI drivers.
65164 + *
65165 + * Allow them on x86 for legacy drivers, though.
65166 + */
65167 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
65168 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
65169 +
65170 +/*
65171 + * readX/writeX() are used to access memory mapped devices. On some
65172 + * architectures the memory mapped IO stuff needs to be accessed
65173 + * differently. On the x86 architecture, we just read/write the
65174 + * memory location directly.
65175 + */
65176 +
65177 +static inline unsigned char readb(const volatile void __iomem *addr)
65178 +{
65179 +       return *(volatile unsigned char __force *) addr;
65180 +}
65181 +static inline unsigned short readw(const volatile void __iomem *addr)
65182 +{
65183 +       return *(volatile unsigned short __force *) addr;
65184 +}
65185 +static inline unsigned int readl(const volatile void __iomem *addr)
65186 +{
65187 +       return *(volatile unsigned int __force *) addr;
65188 +}
65189 +#define readb_relaxed(addr) readb(addr)
65190 +#define readw_relaxed(addr) readw(addr)
65191 +#define readl_relaxed(addr) readl(addr)
65192 +#define __raw_readb readb
65193 +#define __raw_readw readw
65194 +#define __raw_readl readl
65195 +
65196 +static inline void writeb(unsigned char b, volatile void __iomem *addr)
65197 +{
65198 +       *(volatile unsigned char __force *) addr = b;
65199 +}
65200 +static inline void writew(unsigned short b, volatile void __iomem *addr)
65201 +{
65202 +       *(volatile unsigned short __force *) addr = b;
65203 +}
65204 +static inline void writel(unsigned int b, volatile void __iomem *addr)
65205 +{
65206 +       *(volatile unsigned int __force *) addr = b;
65207 +}
65208 +#define __raw_writeb writeb
65209 +#define __raw_writew writew
65210 +#define __raw_writel writel
65211 +
65212 +#define mmiowb()
65213 +
65214 +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
65215 +{
65216 +       memset((void __force *) addr, val, count);
65217 +}
65218 +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
65219 +{
65220 +       __memcpy(dst, (void __force *) src, count);
65221 +}
65222 +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
65223 +{
65224 +       __memcpy((void __force *) dst, src, count);
65225 +}
65226 +
65227 +/*
65228 + * ISA space is 'always mapped' on a typical x86 system, no need to
65229 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
65230 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
65231 + * are physical addresses. The following constant pointer can be
65232 + * used as the IO-area pointer (it can be iounmapped as well, so the
65233 + * analogy with PCI is quite large):
65234 + */
65235 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
65236 +
65237 +#define isa_readb(a) readb(__ISA_IO_base + (a))
65238 +#define isa_readw(a) readw(__ISA_IO_base + (a))
65239 +#define isa_readl(a) readl(__ISA_IO_base + (a))
65240 +#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a))
65241 +#define isa_writew(w,a) writew(w,__ISA_IO_base + (a))
65242 +#define isa_writel(l,a) writel(l,__ISA_IO_base + (a))
65243 +#define isa_memset_io(a,b,c)           memset_io(__ISA_IO_base + (a),(b),(c))
65244 +#define isa_memcpy_fromio(a,b,c)       memcpy_fromio((a),__ISA_IO_base + (b),(c))
65245 +#define isa_memcpy_toio(a,b,c)         memcpy_toio(__ISA_IO_base + (a),(b),(c))
65246 +
65247 +
65248 +/*
65249 + * Again, i386 does not require mem IO specific functions.
65250 + */
65251 +
65252 +#define eth_io_copy_and_sum(a,b,c,d)           eth_copy_and_sum((a),(void __force *)(b),(c),(d))
65253 +#define isa_eth_io_copy_and_sum(a,b,c,d)       eth_copy_and_sum((a),(void __force *)(__ISA_IO_base + (b)),(c),(d))
65254 +
65255 +/**
65256 + *     check_signature         -       find BIOS signatures
65257 + *     @io_addr: mmio address to check 
65258 + *     @signature:  signature block
65259 + *     @length: length of signature
65260 + *
65261 + *     Perform a signature comparison with the mmio address io_addr. This
65262 + *     address should have been obtained by ioremap.
65263 + *     Returns 1 on a match.
65264 + */
65265 +
65266 +static inline int check_signature(volatile void __iomem * io_addr,
65267 +       const unsigned char *signature, int length)
65268 +{
65269 +       int retval = 0;
65270 +       do {
65271 +               if (readb(io_addr) != *signature)
65272 +                       goto out;
65273 +               io_addr++;
65274 +               signature++;
65275 +               length--;
65276 +       } while (length);
65277 +       retval = 1;
65278 +out:
65279 +       return retval;
65280 +}
65281 +
65282 +/*
65283 + *     Cache management
65284 + *
65285 + *     This is needed for two cases
65286 + *     1. Out of order aware processors
65287 + *     2. Accidentally out of order processors (PPro errata #51)
65288 + */
65289 +
65290 +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
65291 +
65292 +static inline void flush_write_buffers(void)
65293 +{
65294 +       __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
65295 +}
65296 +
65297 +#define dma_cache_inv(_start,_size)            flush_write_buffers()
65298 +#define dma_cache_wback(_start,_size)          flush_write_buffers()
65299 +#define dma_cache_wback_inv(_start,_size)      flush_write_buffers()
65300 +
65301 +#else
65302 +
65303 +/* Nothing to do */
65304 +
65305 +#define dma_cache_inv(_start,_size)            do { } while (0)
65306 +#define dma_cache_wback(_start,_size)          do { } while (0)
65307 +#define dma_cache_wback_inv(_start,_size)      do { } while (0)
65308 +#define flush_write_buffers()
65309 +
65310 +#endif
65311 +
65312 +#endif /* __KERNEL__ */
65313 +
65314 +#ifdef SLOW_IO_BY_JUMPING
65315 +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
65316 +#else
65317 +#define __SLOW_DOWN_IO "outb %%al,$0x80;"
65318 +#endif
65319 +
65320 +static inline void slow_down_io(void) {
65321 +       __asm__ __volatile__(
65322 +               __SLOW_DOWN_IO
65323 +#ifdef REALLY_SLOW_IO
65324 +               __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
65325 +#endif
65326 +               : : );
65327 +}
65328 +
65329 +#ifdef CONFIG_X86_NUMAQ
65330 +extern void *xquad_portio;    /* Where the IO area was mapped */
65331 +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
65332 +#define __BUILDIO(bwl,bw,type) \
65333 +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
65334 +       if (xquad_portio) \
65335 +               write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
65336 +       else \
65337 +               out##bwl##_local(value, port); \
65338 +} \
65339 +static inline void out##bwl(unsigned type value, int port) { \
65340 +       out##bwl##_quad(value, port, 0); \
65341 +} \
65342 +static inline unsigned type in##bwl##_quad(int port, int quad) { \
65343 +       if (xquad_portio) \
65344 +               return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
65345 +       else \
65346 +               return in##bwl##_local(port); \
65347 +} \
65348 +static inline unsigned type in##bwl(int port) { \
65349 +       return in##bwl##_quad(port, 0); \
65350 +}
65351 +#else
65352 +#define __BUILDIO(bwl,bw,type) \
65353 +static inline void out##bwl(unsigned type value, int port) { \
65354 +       out##bwl##_local(value, port); \
65355 +} \
65356 +static inline unsigned type in##bwl(int port) { \
65357 +       return in##bwl##_local(port); \
65358 +}
65359 +#endif
65360 +
65361 +
65362 +#define BUILDIO(bwl,bw,type) \
65363 +static inline void out##bwl##_local(unsigned type value, int port) { \
65364 +       __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
65365 +} \
65366 +static inline unsigned type in##bwl##_local(int port) { \
65367 +       unsigned type value; \
65368 +       __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
65369 +       return value; \
65370 +} \
65371 +static inline void out##bwl##_local_p(unsigned type value, int port) { \
65372 +       out##bwl##_local(value, port); \
65373 +       slow_down_io(); \
65374 +} \
65375 +static inline unsigned type in##bwl##_local_p(int port) { \
65376 +       unsigned type value = in##bwl##_local(port); \
65377 +       slow_down_io(); \
65378 +       return value; \
65379 +} \
65380 +__BUILDIO(bwl,bw,type) \
65381 +static inline void out##bwl##_p(unsigned type value, int port) { \
65382 +       out##bwl(value, port); \
65383 +       slow_down_io(); \
65384 +} \
65385 +static inline unsigned type in##bwl##_p(int port) { \
65386 +       unsigned type value = in##bwl(port); \
65387 +       slow_down_io(); \
65388 +       return value; \
65389 +} \
65390 +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
65391 +       __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
65392 +} \
65393 +static inline void ins##bwl(int port, void *addr, unsigned long count) { \
65394 +       __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
65395 +}
65396 +
65397 +BUILDIO(b,b,char)
65398 +BUILDIO(w,w,short)
65399 +BUILDIO(l,,int)
65400 +
65401 +/* We will be supplying our own /dev/mem implementation */
65402 +#define ARCH_HAS_DEV_MEM
65403 +
65404 +#endif
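Not part of the patch: BUILDIO()/__BUILDIO() above stamp out the whole inb/outb family by token pasting. The same generator technique in a self-contained form, writing to a byte array instead of real ports (actual port I/O needs ring-0 or iopl; all names here are mocks):

#include <stdio.h>
#include <string.h>

static unsigned char port_space[65536];     /* mock 64K I/O space */

/* One invocation generates an out##bwl/in##bwl pair, as BUILDIO does. */
#define BUILDIO(bwl, type)                                              \
static void out##bwl(type value, int port) {                            \
        memcpy(&port_space[port], &value, sizeof(value));               \
}                                                                       \
static type in##bwl(int port) {                                         \
        type value;                                                     \
        memcpy(&value, &port_space[port], sizeof(value));               \
        return value;                                                   \
}

BUILDIO(b, unsigned char)
BUILDIO(w, unsigned short)
BUILDIO(l, unsigned int)

int main(void)
{
        outb(0xef, 0x70);
        outw(0xbeef, 0x72);
        outl(0xdeadbeefu, 0x74);
        printf("inb=0x%02x inw=0x%04x inl=0x%08x\n",
               inb(0x70), inw(0x72), inl(0x74));
        return 0;
}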
65405 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/kmap_types.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/kmap_types.h
65406 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/kmap_types.h       1970-01-01 01:00:00.000000000 +0100
65407 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/kmap_types.h  2006-04-10 00:05:52.000000000 +0200
65408 @@ -0,0 +1,32 @@
65409 +#ifndef _ASM_KMAP_TYPES_H
65410 +#define _ASM_KMAP_TYPES_H
65411 +
65412 +#include <linux/config.h>
65413 +
65414 +#ifdef CONFIG_DEBUG_HIGHMEM
65415 +# define D(n) __KM_FENCE_##n ,
65416 +#else
65417 +# define D(n)
65418 +#endif
65419 +
65420 +enum km_type {
65421 +D(0)   KM_BOUNCE_READ,
65422 +D(1)   KM_SKB_SUNRPC_DATA,
65423 +D(2)   KM_SKB_DATA_SOFTIRQ,
65424 +D(3)   KM_USER0,
65425 +D(4)   KM_USER1,
65426 +D(5)   KM_BIO_SRC_IRQ,
65427 +D(6)   KM_BIO_DST_IRQ,
65428 +D(7)   KM_PTE0,
65429 +D(8)   KM_PTE1,
65430 +D(9)   KM_IRQ0,
65431 +D(10)  KM_IRQ1,
65432 +D(11)  KM_SOFTIRQ0,
65433 +D(12)  KM_SOFTIRQ1,
65434 +D(13)  KM_SWIOTLB,
65435 +D(14)  KM_TYPE_NR
65436 +};
65437 +
65438 +#undef D
65439 +
65440 +#endif
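Not part of the patch: with CONFIG_DEBUG_HIGHMEM the D(n) wrapper above interleaves dummy __KM_FENCE_n enumerators between the kmap slots, so a kmap index that is off by one lands on a fence value instead of a neighbouring slot. A compact sketch with the debug case forced on (slot list abridged):

#include <stdio.h>

#define DEBUG_HIGHMEM 1             /* stand-in for CONFIG_DEBUG_HIGHMEM */

#if DEBUG_HIGHMEM
# define D(n) __KM_FENCE_##n ,
#else
# define D(n)
#endif

enum km_type {
D(0)    KM_BOUNCE_READ,
D(1)    KM_USER0,
D(2)    KM_USER1,
D(3)    KM_TYPE_NR
};

#undef D

int main(void)
{
        /* With fences: KM_BOUNCE_READ=1, KM_USER0=3; without: 0 and 1. */
        printf("KM_USER0=%d KM_TYPE_NR=%d\n", KM_USER0, KM_TYPE_NR);
        return 0;
}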
65441 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/mmu_context.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/mmu_context.h
65442 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/mmu_context.h      1970-01-01 01:00:00.000000000 +0100
65443 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/mmu_context.h 2006-04-10 00:05:52.000000000 +0200
65444 @@ -0,0 +1,105 @@
65445 +#ifndef __I386_SCHED_H
65446 +#define __I386_SCHED_H
65447 +
65448 +#include <linux/config.h>
65449 +#include <asm/desc.h>
65450 +#include <asm/atomic.h>
65451 +#include <asm/pgalloc.h>
65452 +#include <asm/tlbflush.h>
65453 +
65454 +/*
65455 + * Used for LDT copy/destruction.
65456 + */
65457 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
65458 +void destroy_context(struct mm_struct *mm);
65459 +
65460 +
65461 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
65462 +{
65463 +#if 0 /* XEN: no lazy tlb */
65464 +       unsigned cpu = smp_processor_id();
65465 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
65466 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
65467 +#endif
65468 +}
65469 +
65470 +#define prepare_arch_switch(next)      __prepare_arch_switch()
65471 +
65472 +static inline void __prepare_arch_switch(void)
65473 +{
65474 +       /*
65475 +        * Save away %fs and %gs. No need to save %es and %ds, as those
65476 +        * are always kernel segments while inside the kernel. Must
65477 +        * happen before reload of cr3/ldt (i.e., not in __switch_to).
65478 +        */
65479 +       asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
65480 +               : "=m" (current->thread.fs),
65481 +                 "=m" (current->thread.gs));
65482 +       asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
65483 +               : : "r" (0) );
65484 +}
65485 +
65486 +extern void mm_pin(struct mm_struct *mm);
65487 +extern void mm_unpin(struct mm_struct *mm);
65488 +void mm_pin_all(void);
65489 +
65490 +static inline void switch_mm(struct mm_struct *prev,
65491 +                            struct mm_struct *next,
65492 +                            struct task_struct *tsk)
65493 +{
65494 +       int cpu = smp_processor_id();
65495 +       struct mmuext_op _op[2], *op = _op;
65496 +
65497 +       if (likely(prev != next)) {
65498 +               if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
65499 +                       mm_pin(next);
65500 +
65501 +               /* stop flush ipis for the previous mm */
65502 +               cpu_clear(cpu, prev->cpu_vm_mask);
65503 +#if 0 /* XEN: no lazy tlb */
65504 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
65505 +               per_cpu(cpu_tlbstate, cpu).active_mm = next;
65506 +#endif
65507 +               cpu_set(cpu, next->cpu_vm_mask);
65508 +
65509 +               /* Re-load page tables: load_cr3(next->pgd) */
65510 +               op->cmd = MMUEXT_NEW_BASEPTR;
65511 +               op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
65512 +               op++;
65513 +
65514 +               /*
65515 +                * load the LDT, if the LDT is different:
65516 +                */
65517 +               if (unlikely(prev->context.ldt != next->context.ldt)) {
65518 +                       /* load_LDT_nolock(&next->context, cpu) */
65519 +                       op->cmd = MMUEXT_SET_LDT;
65520 +                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
65521 +                       op->arg2.nr_ents     = next->context.size;
65522 +                       op++;
65523 +               }
65524 +
65525 +               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
65526 +       }
65527 +#if 0 /* XEN: no lazy tlb */
65528 +       else {
65529 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
65530 +               BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
65531 +
65532 +               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
65533 +                       /* We were in lazy tlb mode and leave_mm disabled 
65534 +                        * tlb flush IPI delivery. We must reload %cr3.
65535 +                        */
65536 +                       load_cr3(next->pgd);
65537 +                       load_LDT_nolock(&next->context, cpu);
65538 +               }
65539 +       }
65540 +#endif
65541 +}
65542 +
65543 +#define deactivate_mm(tsk, mm) \
65544 +       asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
65545 +
65546 +#define activate_mm(prev, next) \
65547 +       switch_mm((prev),(next),NULL)
65548 +
65549 +#endif
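Not part of the patch: switch_mm() above batches one or two mmuext ops by walking a cursor across a small on-stack array and deriving the count from the pointer difference (op - _op). The same fill-as-needed pattern, standalone with a mock op type and a mock submit() in place of HYPERVISOR_mmuext_op():

#include <stdio.h>

struct mock_op { int cmd; unsigned long arg; };

#define NEW_BASEPTR 1
#define SET_LDT     2

static void submit(const struct mock_op *ops, int count)
{
        for (int i = 0; i < count; i++)
                printf("op %d: cmd=%d arg=0x%lx\n", i, ops[i].cmd, ops[i].arg);
}

int main(void)
{
        struct mock_op _op[2], *op = _op;
        int ldt_changed = 1;            /* pretend the LDTs differ */

        op->cmd = NEW_BASEPTR;          /* always queued on a real switch */
        op->arg = 0x1234;
        op++;
        if (ldt_changed) {              /* second op only when needed */
                op->cmd = SET_LDT;
                op->arg = 0x5678;
                op++;
        }
        submit(_op, (int)(op - _op));   /* count = cursor - base */
        return 0;
}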
65550 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/mmu.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/mmu.h
65551 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/mmu.h      1970-01-01 01:00:00.000000000 +0100
65552 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/mmu.h 2006-04-10 00:05:52.000000000 +0200
65553 @@ -0,0 +1,21 @@
65554 +#ifndef __i386_MMU_H
65555 +#define __i386_MMU_H
65556 +
65557 +#include <asm/semaphore.h>
65558 +/*
65559 + * The i386 doesn't have an mmu context, but
65560 + * we put the segment information here.
65561 + *
65562 + * cpu_vm_mask is used to optimize ldt flushing.
65563 + */
65564 +typedef struct { 
65565 +       int size;
65566 +       struct semaphore sem;
65567 +       void *ldt;
65568 +} mm_context_t;
65569 +
65570 +/* mm/memory.c:exit_mmap hook */
65571 +extern void _arch_exit_mmap(struct mm_struct *mm);
65572 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
65573 +
65574 +#endif
65575 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/page.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/page.h
65576 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/page.h     1970-01-01 01:00:00.000000000 +0100
65577 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/page.h        2006-04-10 00:05:52.000000000 +0200
65578 @@ -0,0 +1,327 @@
65579 +#ifndef _I386_PAGE_H
65580 +#define _I386_PAGE_H
65581 +
65582 +/* PAGE_SHIFT determines the page size */
65583 +#define PAGE_SHIFT     12
65584 +#define PAGE_SIZE      (1UL << PAGE_SHIFT)
65585 +#define PAGE_MASK      (~(PAGE_SIZE-1))
65586 +
65587 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
65588 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
65589 +
65590 +#ifdef __KERNEL__
65591 +#ifndef __ASSEMBLY__
65592 +
65593 +#include <linux/config.h>
65594 +#include <linux/string.h>
65595 +#include <linux/types.h>
65596 +#include <linux/kernel.h>
65597 +#include <asm/bug.h>
65598 +#include <xen/interface/xen.h>
65599 +#include <xen/features.h>
65600 +#include <xen/foreign_page.h>
65601 +
65602 +#define arch_free_page(_page,_order)                   \
65603 +({     int foreign = PageForeign(_page);               \
65604 +       if (foreign)                                    \
65605 +               (PageForeignDestructor(_page))(_page);  \
65606 +       foreign;                                        \
65607 +})
65608 +#define HAVE_ARCH_FREE_PAGE
65609 +
65610 +#ifdef CONFIG_XEN_SCRUB_PAGES
65611 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
65612 +#else
65613 +#define scrub_pages(_p,_n) ((void)0)
65614 +#endif
65615 +
65616 +#ifdef CONFIG_X86_USE_3DNOW
65617 +
65618 +#include <asm/mmx.h>
65619 +
65620 +#define clear_page(page)       mmx_clear_page((void *)(page))
65621 +#define copy_page(to,from)     mmx_copy_page(to,from)
65622 +
65623 +#else
65624 +
65625 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
65626 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
65627 +
65628 +/*
65629 + *     On older x86 processors it's not a win to use MMX here, it seems.
65630 + *     Maybe the K6-III?
65631 + */
65632 +
65633 +#define clear_page(page)       memset((void *)(page), 0, PAGE_SIZE)
65634 +#define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)
65635 +
65636 +#endif
65637 +
65638 +#define clear_user_page(page, vaddr, pg)       clear_page(page)
65639 +#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
65640 +
65641 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
65642 +#define INVALID_P2M_ENTRY      (~0UL)
65643 +#define FOREIGN_FRAME_BIT      (1UL<<31)
65644 +#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
65645 +
65646 +extern unsigned long *phys_to_machine_mapping;
65647 +
65648 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
65649 +{
65650 +       if (xen_feature(XENFEAT_auto_translated_physmap))
65651 +               return pfn;
65652 +       return phys_to_machine_mapping[(unsigned int)(pfn)] &
65653 +               ~FOREIGN_FRAME_BIT;
65654 +}
65655 +
65656 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
65657 +{
65658 +       if (xen_feature(XENFEAT_auto_translated_physmap))
65659 +               return 1;
65660 +       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
65661 +}
65662 +
65663 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
65664 +{
65665 +       unsigned long pfn;
65666 +
65667 +       if (xen_feature(XENFEAT_auto_translated_physmap))
65668 +               return mfn;
65669 +
65670 +       /*
65671 +        * The array access can fail (e.g., device space beyond end of RAM).
65672 +        * In such cases it doesn't matter what we return (we return garbage),
65673 +        * but we must handle the fault without crashing!
65674 +        */
65675 +       asm (
65676 +               "1:     movl %1,%0\n"
65677 +               "2:\n"
65678 +               ".section __ex_table,\"a\"\n"
65679 +               "       .align 4\n"
65680 +               "       .long 1b,2b\n"
65681 +               ".previous"
65682 +               : "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) );
65683 +
65684 +       return pfn;
65685 +}
65686 +
65687 +/*
65688 + * We detect special mappings in one of two ways:
65689 + *  1. If the MFN is an I/O page then Xen will set the m2p entry
65690 + *     to be outside our maximum possible pseudophys range.
65691 + *  2. If the MFN belongs to a different domain then we will certainly
65692 + *     not have MFN in our p2m table. Conversely, if the page is ours,
65693 + *     then we'll have p2m(m2p(MFN))==MFN.
65694 + * If we detect a special mapping then it doesn't have a 'struct page'.
65695 + * We force !pfn_valid() by returning an out-of-range pointer.
65696 + *
65697 + * NB. These checks require that, for any MFN that is not in our reservation,
65698 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
65699 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
65700 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
65701 + *
65702 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
65703 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
65704 + *      require. In all the cases we care about, the FOREIGN_FRAME bit is
65705 + *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
65706 + */
65707 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
65708 +{
65709 +       extern unsigned long max_mapnr;
65710 +       unsigned long pfn = mfn_to_pfn(mfn);
65711 +       if ((pfn < max_mapnr)
65712 +           && !xen_feature(XENFEAT_auto_translated_physmap)
65713 +           && (phys_to_machine_mapping[pfn] != mfn))
65714 +               return max_mapnr; /* force !pfn_valid() */
65715 +       return pfn;
65716 +}
65717 +
65718 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
65719 +{
65720 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
65721 +               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
65722 +               return;
65723 +       }
65724 +       phys_to_machine_mapping[pfn] = mfn;
65725 +}
65726 +
65727 +/* Definitions for machine and pseudophysical addresses. */
65728 +#ifdef CONFIG_X86_PAE
65729 +typedef unsigned long long paddr_t;
65730 +typedef unsigned long long maddr_t;
65731 +#else
65732 +typedef unsigned long paddr_t;
65733 +typedef unsigned long maddr_t;
65734 +#endif
65735 +
65736 +static inline maddr_t phys_to_machine(paddr_t phys)
65737 +{
65738 +       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
65739 +       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
65740 +       return machine;
65741 +}
65742 +static inline paddr_t machine_to_phys(maddr_t machine)
65743 +{
65744 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
65745 +       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
65746 +       return phys;
65747 +}
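
Both helpers translate only the frame number and carry the sub-page offset through unchanged. A short sketch, assuming a directly-mapped (lowmem) buffer so that __pa() below is valid:

        /* Sketch: pseudo-physical -> machine -> pseudo-physical round trip.
         * Holds for local (non-foreign) frames with a consistent p2m/m2p. */
        static void example_round_trip(void *buf)
        {
                paddr_t phys = __pa(buf);
                maddr_t mach = phys_to_machine(phys);

                BUG_ON((mach & ~PAGE_MASK) != (phys & ~PAGE_MASK));
                BUG_ON(machine_to_phys(mach) != phys);
        }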
65748 +
65749 +/*
65750 + * These are used to make use of C type-checking..
65751 + */
65752 +extern int nx_enabled;
65753 +#ifdef CONFIG_X86_PAE
65754 +extern unsigned long long __supported_pte_mask;
65755 +typedef struct { unsigned long pte_low, pte_high; } pte_t;
65756 +typedef struct { unsigned long long pmd; } pmd_t;
65757 +typedef struct { unsigned long long pgd; } pgd_t;
65758 +typedef struct { unsigned long long pgprot; } pgprot_t;
65759 +#define __pte(x) ({ unsigned long long _x = (x);        \
65760 +    if (_x & 1) _x = phys_to_machine(_x);               \
65761 +    ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); })
65762 +#define __pgd(x) ({ unsigned long long _x = (x); \
65763 +    (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
65764 +#define __pmd(x) ({ unsigned long long _x = (x); \
65765 +    (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
65766 +static inline unsigned long long pte_val(pte_t x)
65767 +{
65768 +       unsigned long long ret;
65769 +
65770 +       if (x.pte_low) {
65771 +               ret = x.pte_low | (unsigned long long)x.pte_high << 32;
65772 +               ret = machine_to_phys(ret) | 1;
65773 +       } else {
65774 +               ret = 0;
65775 +       }
65776 +       return ret;
65777 +}
65778 +static inline unsigned long long pmd_val(pmd_t x)
65779 +{
65780 +       unsigned long long ret = x.pmd;
65781 +       if (ret) ret = machine_to_phys(ret) | 1;
65782 +       return ret;
65783 +}
65784 +static inline unsigned long long pgd_val(pgd_t x)
65785 +{
65786 +       unsigned long long ret = x.pgd;
65787 +       if (ret) ret = machine_to_phys(ret) | 1;
65788 +       return ret;
65789 +}
65790 +static inline unsigned long long pte_val_ma(pte_t x)
65791 +{
65792 +       return (unsigned long long)x.pte_high << 32 | x.pte_low;
65793 +}
65794 +#define HPAGE_SHIFT    21
65795 +#else
65796 +typedef struct { unsigned long pte_low; } pte_t;
65797 +typedef struct { unsigned long pgd; } pgd_t;
65798 +typedef struct { unsigned long pgprot; } pgprot_t;
65799 +#define boot_pte_t pte_t /* or would you rather have a typedef */
65800 +#define pte_val(x)     (((x).pte_low & 1) ? machine_to_phys((x).pte_low) : \
65801 +                        (x).pte_low)
65802 +#define pte_val_ma(x)  ((x).pte_low)
65803 +#define __pte(x) ({ unsigned long _x = (x); \
65804 +    (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
65805 +#define __pgd(x) ({ unsigned long _x = (x); \
65806 +    (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
65807 +static inline unsigned long pgd_val(pgd_t x)
65808 +{
65809 +       unsigned long ret = x.pgd;
65810 +       if (ret) ret = machine_to_phys(ret) | 1;
65811 +       return ret;
65812 +}
65813 +#define HPAGE_SHIFT    22
65814 +#endif
65815 +#define PTE_MASK       PAGE_MASK
65816 +
65817 +#ifdef CONFIG_HUGETLB_PAGE
65818 +#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
65819 +#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
65820 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
65821 +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
65822 +#endif
65823 +
65824 +#define pgprot_val(x)  ((x).pgprot)
65825 +
65826 +#define __pte_ma(x)    ((pte_t) { (x) } )
65827 +#define __pgprot(x)    ((pgprot_t) { (x) } )
65828 +
65829 +#endif /* !__ASSEMBLY__ */
65830 +
65831 +/* to align the pointer to the (next) page boundary */
65832 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
65833 +
65834 +/*
65835 + * This handles the memory map.. We could make this a config
65836 + * option, but too many people screw it up, and too few need
65837 + * it.
65838 + *
65839 + * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
65840 + * a virtual address space of one gigabyte, which limits the
65841 + * amount of physical memory you can use to about 950MB. 
65842 + *
65843 + * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
65844 + * and CONFIG_HIGHMEM64G options in the kernel configuration.
65845 + */
65846 +
65847 +#ifndef __ASSEMBLY__
65848 +
65849 +/*
65850 + * This much address space is reserved for vmalloc() and iomap()
65851 + * as well as fixmap mappings.
65852 + */
65853 +extern unsigned int __VMALLOC_RESERVE;
65854 +
65855 +extern int sysctl_legacy_va_layout;
65856 +
65857 +extern int page_is_ram(unsigned long pagenr);
65858 +
65859 +#endif /* __ASSEMBLY__ */
65860 +
65861 +#ifdef __ASSEMBLY__
65862 +#define __PAGE_OFFSET          CONFIG_PAGE_OFFSET
65863 +#define __PHYSICAL_START       CONFIG_PHYSICAL_START
65864 +#else
65865 +#define __PAGE_OFFSET          ((unsigned long)CONFIG_PAGE_OFFSET)
65866 +#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
65867 +#endif
65868 +#define __KERNEL_START         (__PAGE_OFFSET + __PHYSICAL_START)
65869 +
65870 +#undef LOAD_OFFSET
65871 +#define LOAD_OFFSET            0
65872 +
65873 +
65874 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
65875 +#define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
65876 +#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
65877 +#define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
65878 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
65879 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
65880 +#ifdef CONFIG_FLATMEM
65881 +#define pfn_to_page(pfn)       (mem_map + (pfn))
65882 +#define page_to_pfn(page)      ((unsigned long)((page) - mem_map))
65883 +#define pfn_valid(pfn)         ((pfn) < max_mapnr)
65884 +#endif /* CONFIG_FLATMEM */
65885 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
65886 +
65887 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
65888 +
65889 +#define VM_DATA_DEFAULT_FLAGS \
65890 +       (VM_READ | VM_WRITE | \
65891 +       ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
65892 +                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
65893 +
65894 +/* VIRT <-> MACHINE conversion */
65895 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
65896 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
65897 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
65898 +
65899 +#define __HAVE_ARCH_GATE_AREA 1
65900 +
65901 +#endif /* __KERNEL__ */
65902 +
65903 +#include <asm-generic/page.h>
65904 +
65905 +#endif /* _I386_PAGE_H */
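
The VIRT <-> MACHINE helpers above work at page granularity. A sketch of their round-trip behaviour for a local lowmem address (the function is illustrative):

        /* Sketch: mfn_to_virt(virt_to_mfn(va)) recovers the page-aligned
         * virtual address, for local frames with a consistent p2m/m2p. */
        static void example_virt_mfn(void *va)
        {
                unsigned long mfn = virt_to_mfn(va);

                BUG_ON(mfn_to_virt(mfn) !=
                       (void *)((unsigned long)va & PAGE_MASK));
        }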
65906 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/param.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/param.h
65907 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/param.h    1970-01-01 01:00:00.000000000 +0100
65908 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/param.h       2006-04-10 00:05:52.000000000 +0200
65909 @@ -0,0 +1,24 @@
65910 +#ifndef _ASMi386_PARAM_H
65911 +#define _ASMi386_PARAM_H
65912 +
65913 +#ifdef __KERNEL__
65914 +# include <linux/config.h>
65915 +# define HZ            CONFIG_HZ       /* Internal kernel timer frequency */
65916 +# define USER_HZ       100             /* .. some user interfaces are in "ticks" */
65917 +# define CLOCKS_PER_SEC                (USER_HZ)       /* like times() */
65918 +#endif
65919 +
65920 +#ifndef HZ
65921 +#define HZ 100
65922 +#endif
65923 +
65924 +#define EXEC_PAGESIZE  4096
65925 +
65926 +#ifndef NOGROUP
65927 +#define NOGROUP                (-1)
65928 +#endif
65929 +
65930 +#define MAXHOSTNAMELEN 64      /* max length of hostname */
65931 +#define COMMAND_LINE_SIZE 256
65932 +
65933 +#endif
65934 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pci.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pci.h
65935 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pci.h      1970-01-01 01:00:00.000000000 +0100
65936 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pci.h 2006-04-10 00:05:52.000000000 +0200
65937 @@ -0,0 +1,154 @@
65938 +#ifndef __i386_PCI_H
65939 +#define __i386_PCI_H
65940 +
65941 +#include <linux/config.h>
65942 +
65943 +#ifdef __KERNEL__
65944 +#include <linux/mm.h>          /* for struct page */
65945 +
65946 +/* Can be used to override the logic in pci_scan_bus for skipping
65947 +   already-configured bus numbers - to be used for buggy BIOSes
65948 +   or architectures with incomplete PCI setup by the loader */
65949 +
65950 +#ifdef CONFIG_PCI
65951 +extern unsigned int pcibios_assign_all_busses(void);
65952 +#else
65953 +#define pcibios_assign_all_busses()    0
65954 +#endif
65955 +#define pcibios_scan_all_fns(a, b)     0
65956 +
65957 +extern unsigned long pci_mem_start;
65958 +#define PCIBIOS_MIN_IO         0x1000
65959 +#define PCIBIOS_MIN_MEM                (pci_mem_start)
65960 +
65961 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
65962 +
65963 +void pcibios_config_init(void);
65964 +struct pci_bus * pcibios_scan_root(int bus);
65965 +
65966 +void pcibios_set_master(struct pci_dev *dev);
65967 +void pcibios_penalize_isa_irq(int irq, int active);
65968 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
65969 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
65970 +
65971 +/* Dynamic DMA mapping stuff.
65972 + * i386 has everything mapped statically.
65973 + */
65974 +
65975 +#include <linux/types.h>
65976 +#include <linux/slab.h>
65977 +#include <asm/scatterlist.h>
65978 +#include <linux/string.h>
65979 +#include <asm/io.h>
65980 +
65981 +struct pci_dev;
65982 +
65983 +#ifdef CONFIG_SWIOTLB
65984 +
65985 +
65986 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
65987 +#define PCI_DMA_BUS_IS_PHYS    (0)
65988 +
65989 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
65990 +       dma_addr_t ADDR_NAME;
65991 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
65992 +       __u32 LEN_NAME;
65993 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
65994 +       ((PTR)->ADDR_NAME)
65995 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
65996 +       (((PTR)->ADDR_NAME) = (VAL))
65997 +#define pci_unmap_len(PTR, LEN_NAME)                   \
65998 +       ((PTR)->LEN_NAME)
65999 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
66000 +       (((PTR)->LEN_NAME) = (VAL))
66001 +
66002 +#else
66003 +
66004 +/* The PCI address space does equal the physical memory
66005 + * address space.  The networking and block device layers use
66006 + * this boolean for bounce buffer decisions.
66007 + */
66008 +#define PCI_DMA_BUS_IS_PHYS    (1)
66009 +
66010 +/* pci_unmap_{page,single} is a nop so... */
66011 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
66012 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
66013 +#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
66014 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
66015 +#define pci_unmap_len(PTR, LEN_NAME)           (0)
66016 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
66017 +
66018 +#endif
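
Driver code uses these macros identically whether they expand to real fields (the SWIOTLB case) or to nothing. A hedged sketch of the conventional pattern; the structure and function names are hypothetical:

        /* Sketch: stash unmap bookkeeping only when the platform needs it. */
        struct example_desc {
                void *buf;
                DECLARE_PCI_UNMAP_ADDR(mapping)
                DECLARE_PCI_UNMAP_LEN(len)
        };

        static void example_map(struct pci_dev *pdev, struct example_desc *d,
                                void *buf, size_t size)
        {
                dma_addr_t dma = pci_map_single(pdev, buf, size,
                                                PCI_DMA_FROMDEVICE);

                pci_unmap_addr_set(d, mapping, dma);
                pci_unmap_len_set(d, len, size);
        }

        static void example_unmap(struct pci_dev *pdev, struct example_desc *d)
        {
                pci_unmap_single(pdev, pci_unmap_addr(d, mapping),
                                 pci_unmap_len(d, len), PCI_DMA_FROMDEVICE);
        }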
66019 +
66020 +/* This is always fine. */
66021 +#define pci_dac_dma_supported(pci_dev, mask)   (1)
66022 +
66023 +static inline dma64_addr_t
66024 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
66025 +{
66026 +       return ((dma64_addr_t) page_to_phys(page) +
66027 +               (dma64_addr_t) offset);
66028 +}
66029 +
66030 +static inline struct page *
66031 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
66032 +{
66033 +       return pfn_to_page(dma_addr >> PAGE_SHIFT);
66034 +}
66035 +
66036 +static inline unsigned long
66037 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
66038 +{
66039 +       return (dma_addr & ~PAGE_MASK);
66040 +}
66041 +
66042 +static inline void
66043 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
66044 +{
66045 +}
66046 +
66047 +static inline void
66048 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
66049 +{
66050 +       flush_write_buffers();
66051 +}
66052 +
66053 +#define HAVE_PCI_MMAP
66054 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
66055 +                              enum pci_mmap_state mmap_state, int write_combine);
66056 +
66057 +
66058 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
66059 +{
66060 +}
66061 +
66062 +#ifdef CONFIG_PCI
66063 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
66064 +                                       enum pci_dma_burst_strategy *strat,
66065 +                                       unsigned long *strategy_parameter)
66066 +{
66067 +       *strat = PCI_DMA_BURST_INFINITY;
66068 +       *strategy_parameter = ~0UL;
66069 +}
66070 +#endif
66071 +
66072 +#endif /* __KERNEL__ */
66073 +
66074 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
66075 +#include <xen/pcifront.h>
66076 +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
66077 +
66078 +/* implement the pci_ DMA API in terms of the generic device dma_ one */
66079 +#include <asm-generic/pci-dma-compat.h>
66080 +
66081 +/* generic pci stuff */
66082 +#include <asm-generic/pci.h>
66083 +
66084 +/* On Xen we have to scan all functions since Xen hides bridges from
66085 + * us.  If a bridge is at fn=0 and that slot has a multifunction
66086 + * device, we won't find the additional devices without scanning all
66087 + * functions. */
66088 +#undef pcibios_scan_all_fns
66089 +#define pcibios_scan_all_fns(a, b)     1
66090 +
66091 +#endif /* __i386_PCI_H */
66092 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgalloc.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgalloc.h
66093 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgalloc.h  1970-01-01 01:00:00.000000000 +0100
66094 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgalloc.h     2006-04-10 00:05:52.000000000 +0200
66095 @@ -0,0 +1,64 @@
66096 +#ifndef _I386_PGALLOC_H
66097 +#define _I386_PGALLOC_H
66098 +
66099 +#include <linux/config.h>
66100 +#include <asm/fixmap.h>
66101 +#include <linux/threads.h>
66102 +#include <linux/mm.h>          /* for struct page */
66103 +#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
66104 +
66105 +/* Is this pagetable pinned? */
66106 +#define PG_pinned      PG_arch_1
66107 +
66108 +#define pmd_populate_kernel(mm, pmd, pte) \
66109 +               set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
66110 +
66111 +#define pmd_populate(mm, pmd, pte)                                     \
66112 +do {                                                                   \
66113 +       if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) {     \
66114 +               if (!PageHighMem(pte))                                  \
66115 +                       BUG_ON(HYPERVISOR_update_va_mapping(            \
66116 +                         (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
66117 +                         pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));\
66118 +               set_pmd(pmd, __pmd(_PAGE_TABLE +                        \
66119 +                       ((unsigned long long)page_to_pfn(pte) <<        \
66120 +                               (unsigned long long) PAGE_SHIFT)));     \
66121 +       } else {                                                        \
66122 +               *(pmd) = __pmd(_PAGE_TABLE +                            \
66123 +                       ((unsigned long long)page_to_pfn(pte) <<        \
66124 +                               (unsigned long long) PAGE_SHIFT));      \
66125 +       }                                                               \
66126 +} while (0)
66127 +
66128 +/*
66129 + * Allocate and free page tables.
66130 + */
66131 +extern pgd_t *pgd_alloc(struct mm_struct *);
66132 +extern void pgd_free(pgd_t *pgd);
66133 +
66134 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
66135 +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
66136 +
66137 +static inline void pte_free_kernel(pte_t *pte)
66138 +{
66139 +       make_page_writable(pte, XENFEAT_writable_page_tables); /* before free */
66140 +       free_page((unsigned long)pte);
66141 +}
66142 +
66143 +extern void pte_free(struct page *pte);
66144 +
66145 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
66146 +
66147 +#ifdef CONFIG_X86_PAE
66148 +/*
66149 + * In the PAE case we free the pmds as part of the pgd.
66150 + */
66151 +#define pmd_alloc_one(mm, addr)                ({ BUG(); ((pmd_t *)2); })
66152 +#define pmd_free(x)                    do { } while (0)
66153 +#define __pmd_free_tlb(tlb,x)          do { } while (0)
66154 +#define pud_populate(mm, pmd, pte)     BUG()
66155 +#endif
66156 +
66157 +#define check_pgt_cache()      do { } while (0)
66158 +
66159 +#endif /* _I386_PGALLOC_H */
66160 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h
66161 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h      1970-01-01 01:00:00.000000000 +0100
66162 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h 2006-04-10 00:05:52.000000000 +0200
66163 @@ -0,0 +1,21 @@
66164 +#ifndef _I386_PGTABLE_2LEVEL_DEFS_H
66165 +#define _I386_PGTABLE_2LEVEL_DEFS_H
66166 +
66167 +#define HAVE_SHARED_KERNEL_PMD 0
66168 +
66169 +/*
66170 + * traditional i386 two-level paging structure:
66171 + */
66172 +
66173 +#define PGDIR_SHIFT    22
66174 +#define PTRS_PER_PGD   1024
66175 +#define PTRS_PER_PGD_NO_HV     (HYPERVISOR_VIRT_START >> PGDIR_SHIFT)
66176 +
66177 +/*
66178 + * the i386 is two-level, so we don't really have any
66179 + * PMD directory physically.
66180 + */
66181 +
66182 +#define PTRS_PER_PTE   1024
66183 +
66184 +#endif /* _I386_PGTABLE_2LEVEL_DEFS_H */
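
For reference, PTRS_PER_PGD_NO_HV counts the pgd slots below the hypervisor hole. Assuming the usual 32-bit non-PAE value of HYPERVISOR_VIRT_START, 0xFC000000, this gives 0xFC000000 >> 22 = 1008, i.e. the guest owns 1008 of the 1024 pgd entries and the top 16 map Xen.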
66185 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable-2level.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable-2level.h
66186 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable-2level.h   1970-01-01 01:00:00.000000000 +0100
66187 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable-2level.h      2006-04-10 00:05:52.000000000 +0200
66188 @@ -0,0 +1,83 @@
66189 +#ifndef _I386_PGTABLE_2LEVEL_H
66190 +#define _I386_PGTABLE_2LEVEL_H
66191 +
66192 +#include <asm-generic/pgtable-nopmd.h>
66193 +
66194 +#define pte_ERROR(e) \
66195 +       printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
66196 +#define pgd_ERROR(e) \
66197 +       printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
66198 +
66199 +/*
66200 + * Certain architectures need to do special things when PTEs
66201 + * within a page table are directly modified.  Thus, the following
66202 + * hook is made available.
66203 + */
66204 +#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
66205 +
66206 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
66207 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
66208 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
66209 +               set_pte((ptep), (pteval));                              \
66210 +} while (0)
66211 +
66212 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do {                     \
66213 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
66214 +           HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
66215 +               set_pte((ptep), (pteval));                              \
66216 +               xen_invlpg((addr));                                     \
66217 +       }                                                               \
66218 +} while (0)
66219 +
66220 +#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
66221 +
66222 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
66223 +
66224 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte_low, 0))
66225 +#define pte_same(a, b)         ((a).pte_low == (b).pte_low)
66226 +#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
66227 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
66228 +
66229 +#define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
66230 +
66231 +#define pte_none(x)            (!(x).pte_low)
66232 +#define pfn_pte(pfn, prot)     __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
66233 +#define pfn_pte_ma(pfn, prot)  __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
66234 +#define pfn_pmd(pfn, prot)     __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
66235 +
66236 +/*
66237 + * All present user pages are user-executable:
66238 + */
66239 +static inline int pte_exec(pte_t pte)
66240 +{
66241 +       return pte_user(pte);
66242 +}
66243 +
66244 +/*
66245 + * All present pages are kernel-executable:
66246 + */
66247 +static inline int pte_exec_kernel(pte_t pte)
66248 +{
66249 +       return 1;
66250 +}
66251 +
66252 +/*
66253 + * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
66254 + * into this range:
66255 + */
66256 +#define PTE_FILE_MAX_BITS      29
66257 +
66258 +#define pte_to_pgoff(pte) \
66259 +       ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
66260 +
66261 +#define pgoff_to_pte(off) \
66262 +       ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
66263 +
66264 +/* Encode and de-code a swap entry */
66265 +#define __swp_type(x)                  (((x).val >> 1) & 0x1f)
66266 +#define __swp_offset(x)                        ((x).val >> 8)
66267 +#define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
66268 +#define __pte_to_swp_entry(pte)                ((swp_entry_t) { (pte).pte_low })
66269 +#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
66270 +
66271 +#endif /* _I386_PGTABLE_2LEVEL_H */
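
With bits 0 (present), 6 and 7 reserved in a non-present PTE, the swap type sits in bits 1-5 and the offset begins at bit 8. A worked example with arbitrary values (assumes swp_entry_t from <linux/swapops.h>):

        /* Sketch: encode swap type 3, offset 0x1234, then decode. */
        swp_entry_t e = __swp_entry(3, 0x1234); /* (3 << 1) | (0x1234 << 8) */

        BUG_ON(__swp_type(e) != 3);             /* (e.val >> 1) & 0x1f */
        BUG_ON(__swp_offset(e) != 0x1234);      /* e.val >> 8 */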
66272 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h
66273 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h      1970-01-01 01:00:00.000000000 +0100
66274 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h 2006-04-10 00:05:52.000000000 +0200
66275 @@ -0,0 +1,25 @@
66276 +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
66277 +#define _I386_PGTABLE_3LEVEL_DEFS_H
66278 +
66279 +#define HAVE_SHARED_KERNEL_PMD 0
66280 +
66281 +/*
66282 + * PGDIR_SHIFT determines what a top-level page table entry can map
66283 + */
66284 +#define PGDIR_SHIFT    30
66285 +#define PTRS_PER_PGD   4
66286 +#define PTRS_PER_PGD_NO_HV 4
66287 +
66288 +/*
66289 + * PMD_SHIFT determines the size of the area a middle-level
66290 + * page table can map
66291 + */
66292 +#define PMD_SHIFT      21
66293 +#define PTRS_PER_PMD   512
66294 +
66295 +/*
66296 + * entries per page directory level
66297 + */
66298 +#define PTRS_PER_PTE   512
66299 +
66300 +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
66301 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable-3level.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable-3level.h
66302 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable-3level.h   1970-01-01 01:00:00.000000000 +0100
66303 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable-3level.h      2006-04-10 00:05:52.000000000 +0200
66304 @@ -0,0 +1,181 @@
66305 +#ifndef _I386_PGTABLE_3LEVEL_H
66306 +#define _I386_PGTABLE_3LEVEL_H
66307 +
66308 +#include <asm-generic/pgtable-nopud.h>
66309 +
66310 +/*
66311 + * Intel Physical Address Extension (PAE) Mode - three-level page
66312 + * tables on PPro+ CPUs.
66313 + *
66314 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
66315 + */
66316 +
66317 +#define pte_ERROR(e) \
66318 +       printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
66319 +#define pmd_ERROR(e) \
66320 +       printk("%s:%d: bad pmd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
66321 +#define pgd_ERROR(e) \
66322 +       printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
66323 +
66324 +#define pud_none(pud)                          0
66325 +#define pud_bad(pud)                           0
66326 +#define pud_present(pud)                       1
66327 +
66328 +/*
66329 + * Is the pte executable?
66330 + */
66331 +static inline int pte_x(pte_t pte)
66332 +{
66333 +       return !(pte_val(pte) & _PAGE_NX);
66334 +}
66335 +
66336 +/*
66337 + * All present user-pages with !NX bit are user-executable:
66338 + */
66339 +static inline int pte_exec(pte_t pte)
66340 +{
66341 +       return pte_user(pte) && pte_x(pte);
66342 +}
66343 +/*
66344 + * All present pages with !NX bit are kernel-executable:
66345 + */
66346 +static inline int pte_exec_kernel(pte_t pte)
66347 +{
66348 +       return pte_x(pte);
66349 +}
66350 +
66351 +/* Rules for using set_pte: the pte being assigned *must* be
66352 + * either not present or in a state where the hardware will
66353 + * not attempt to update the pte.  In places where this is
66354 + * not possible, use pte_get_and_clear to obtain the old pte
66355 + * value and then use set_pte to update it.  -ben
66356 + */
66357 +#define __HAVE_ARCH_SET_PTE_ATOMIC
66358 +
66359 +#if 1
66360 +/* use writable pagetables */
66361 +static inline void set_pte(pte_t *ptep, pte_t pte)
66362 +{
66363 +       ptep->pte_high = pte.pte_high;
66364 +       smp_wmb();
66365 +       ptep->pte_low = pte.pte_low;
66366 +}
66367 +# define set_pte_atomic(pteptr,pteval) \
66368 +               set_64bit((unsigned long long *)(pteptr),pte_val_ma(pteval))
66369 +#else
66370 +/* no writable pagetables */
66371 +# define set_pte(pteptr,pteval)                                \
66372 +               xen_l1_entry_update((pteptr), (pteval))
66373 +# define set_pte_atomic(pteptr,pteval) set_pte(pteptr,pteval)
66374 +#endif
66375 +
66376 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
66377 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
66378 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
66379 +               set_pte((ptep), (pteval));                              \
66380 +} while (0)
66381 +
66382 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do {                     \
66383 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
66384 +           HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
66385 +               set_pte((ptep), (pteval));                              \
66386 +               xen_invlpg((addr));                                     \
66387 +       }                                                               \
66388 +} while (0)
66389 +
66390 +#define set_pmd(pmdptr,pmdval)                         \
66391 +               xen_l2_entry_update((pmdptr), (pmdval))
66392 +#define set_pud(pudptr,pudval) \
66393 +               xen_l3_entry_update((pudptr), (pudval))
66394 +
66395 +/*
66396 + * Pentium-II erratum A13: in PAE mode we explicitly have to flush
66397 + * the TLB via cr3 if the top-level pgd is changed...
66398 + * We do not let the generic code free and clear pgd entries due to
66399 + * this erratum.
66400 + */
66401 +static inline void pud_clear (pud_t * pud) { }
66402 +
66403 +#define pud_page(pud) \
66404 +((struct page *) __va(pud_val(pud) & PAGE_MASK))
66405 +
66406 +#define pud_page_kernel(pud) \
66407 +((unsigned long) __va(pud_val(pud) & PAGE_MASK))
66408 +
66409 +
66410 +/* Find an entry in the second-level page table.. */
66411 +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
66412 +                       pmd_index(address))
66413 +
66414 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
66415 +{
66416 +       pte_t res;
66417 +
66418 +       /* xchg acts as a barrier before the setting of the high bits */
66419 +       res.pte_low = xchg(&ptep->pte_low, 0);
66420 +       res.pte_high = ptep->pte_high;
66421 +       ptep->pte_high = 0;
66422 +
66423 +       return res;
66424 +}
66425 +
66426 +static inline int pte_same(pte_t a, pte_t b)
66427 +{
66428 +       return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
66429 +}
66430 +
66431 +#define pte_page(x)    pfn_to_page(pte_pfn(x))
66432 +
66433 +static inline int pte_none(pte_t pte)
66434 +{
66435 +       return !pte.pte_low && !pte.pte_high;
66436 +}
66437 +
66438 +#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
66439 +                      (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
66440 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
66441 +
66442 +extern unsigned long long __supported_pte_mask;
66443 +
66444 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
66445 +{
66446 +       pte_t pte;
66447 +
66448 +       pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
66449 +                                       (pgprot_val(pgprot) >> 32);
66450 +       pte.pte_high &= (__supported_pte_mask >> 32);
66451 +       pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
66452 +                                                       __supported_pte_mask;
66453 +       return pte;
66454 +}
66455 +
66456 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
66457 +{
66458 +       return pfn_pte_ma(pfn_to_mfn(page_nr), pgprot);
66459 +}
66460 +
66461 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
66462 +{
66463 +       BUG(); panic("needs review");
66464 +       return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | \
66465 +                       pgprot_val(pgprot)) & __supported_pte_mask);
66466 +}
66467 +
66468 +/*
66469 + * Bits 0, 6 and 7 are taken in the low part of the pte,
66470 + * put the 32 bits of offset into the high part.
66471 + */
66472 +#define pte_to_pgoff(pte) ((pte).pte_high)
66473 +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
66474 +#define PTE_FILE_MAX_BITS       32
66475 +
66476 +/* Encode and de-code a swap entry */
66477 +#define __swp_type(x)                  (((x).val) & 0x1f)
66478 +#define __swp_offset(x)                        ((x).val >> 5)
66479 +#define __swp_entry(type, offset)      ((swp_entry_t){(type) | (offset) << 5})
66480 +#define __pte_to_swp_entry(pte)                ((swp_entry_t){ (pte).pte_high })
66481 +#define __swp_entry_to_pte(x)          ((pte_t){ 0, (x).val })
66482 +
66483 +#define __pmd_free_tlb(tlb, x)         do { } while (0)
66484 +
66485 +#endif /* _I386_PGTABLE_3LEVEL_H */
66486 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable.h
66487 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/pgtable.h  1970-01-01 01:00:00.000000000 +0100
66488 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/pgtable.h     2006-04-10 00:05:52.000000000 +0200
66489 @@ -0,0 +1,521 @@
66490 +#ifndef _I386_PGTABLE_H
66491 +#define _I386_PGTABLE_H
66492 +
66493 +#include <linux/config.h>
66494 +#include <asm/hypervisor.h>
66495 +
66496 +/*
66497 + * The Linux memory management assumes a three-level page table setup. On
66498 + * the i386, we use that, but "fold" the mid level into the top-level page
66499 + * table, so that we physically have the same two-level page table as the
66500 + * i386 mmu expects.
66501 + *
66502 + * This file contains the functions and defines necessary to modify and use
66503 + * the i386 page table tree.
66504 + */
66505 +#ifndef __ASSEMBLY__
66506 +#include <asm/processor.h>
66507 +#include <asm/fixmap.h>
66508 +#include <linux/threads.h>
66509 +
66510 +#ifndef _I386_BITOPS_H
66511 +#include <asm/bitops.h>
66512 +#endif
66513 +
66514 +#include <linux/slab.h>
66515 +#include <linux/list.h>
66516 +#include <linux/spinlock.h>
66517 +
66518 +struct mm_struct;
66519 +struct vm_area_struct;
66520 +
66521 +/*
66522 + * ZERO_PAGE is a global shared page that is always zero: used
66523 + * for zero-mapped memory areas etc..
66524 + */
66525 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
66526 +extern unsigned long empty_zero_page[1024];
66527 +extern pgd_t *swapper_pg_dir;
66528 +extern kmem_cache_t *pgd_cache;
66529 +extern kmem_cache_t *pmd_cache;
66530 +extern spinlock_t pgd_lock;
66531 +extern struct page *pgd_list;
66532 +
66533 +void pmd_ctor(void *, kmem_cache_t *, unsigned long);
66534 +void pgd_ctor(void *, kmem_cache_t *, unsigned long);
66535 +void pgd_dtor(void *, kmem_cache_t *, unsigned long);
66536 +void pgtable_cache_init(void);
66537 +void paging_init(void);
66538 +
66539 +/*
66540 + * The Linux x86 paging architecture is 'compile-time dual-mode', it
66541 + * implements both the traditional 2-level x86 page tables and the
66542 + * newer 3-level PAE-mode page tables.
66543 + */
66544 +#ifdef CONFIG_X86_PAE
66545 +# include <asm/pgtable-3level-defs.h>
66546 +# define PMD_SIZE      (1UL << PMD_SHIFT)
66547 +# define PMD_MASK      (~(PMD_SIZE-1))
66548 +#else
66549 +# include <asm/pgtable-2level-defs.h>
66550 +#endif
66551 +
66552 +#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
66553 +#define PGDIR_MASK     (~(PGDIR_SIZE-1))
66554 +
66555 +#define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
66556 +#define FIRST_USER_ADDRESS     0
66557 +
66558 +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
66559 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
66560 +
66561 +#define TWOLEVEL_PGDIR_SHIFT   22
66562 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
66563 +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
66564 +
66565 +/* Just any arbitrary offset to the start of the vmalloc VM area: the
66566 + * current 8MB value just means that there will be an 8MB "hole" after the
66567 + * physical memory until the kernel virtual memory starts.  That means that
66568 + * any out-of-bounds memory accesses will hopefully be caught.
66569 + * The vmalloc() routines leave a hole of 4kB between each vmalloced
66570 + * area for the same reason. ;)
66571 + */
66572 +#define VMALLOC_OFFSET (8*1024*1024)
66573 +#define VMALLOC_START  (((unsigned long) high_memory + vmalloc_earlyreserve + \
66574 +                       2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
66575 +#ifdef CONFIG_HIGHMEM
66576 +# define VMALLOC_END   (PKMAP_BASE-2*PAGE_SIZE)
66577 +#else
66578 +# define VMALLOC_END   (FIXADDR_START-2*PAGE_SIZE)
66579 +#endif
66580 +
66581 +/*
66582 + * _PAGE_PSE set in the page directory entry just means that
66583 + * the page directory entry points directly to a 4MB-aligned block of
66584 + * memory. 
66585 + */
66586 +#define _PAGE_BIT_PRESENT      0
66587 +#define _PAGE_BIT_RW           1
66588 +#define _PAGE_BIT_USER         2
66589 +#define _PAGE_BIT_PWT          3
66590 +#define _PAGE_BIT_PCD          4
66591 +#define _PAGE_BIT_ACCESSED     5
66592 +#define _PAGE_BIT_DIRTY                6
66593 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page, Pentium+, if present.. */
66594 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
66595 +#define _PAGE_BIT_UNUSED1      9       /* available for programmer */
66596 +#define _PAGE_BIT_UNUSED2      10
66597 +#define _PAGE_BIT_UNUSED3      11
66598 +#define _PAGE_BIT_NX           63
66599 +
66600 +#define _PAGE_PRESENT  0x001
66601 +#define _PAGE_RW       0x002
66602 +#define _PAGE_USER     0x004
66603 +#define _PAGE_PWT      0x008
66604 +#define _PAGE_PCD      0x010
66605 +#define _PAGE_ACCESSED 0x020
66606 +#define _PAGE_DIRTY    0x040
66607 +#define _PAGE_PSE      0x080   /* 4 MB (or 2MB) page, Pentium+, if present.. */
66608 +#define _PAGE_GLOBAL   0x100   /* Global TLB entry PPro+ */
66609 +#define _PAGE_UNUSED1  0x200   /* available for programmer */
66610 +#define _PAGE_UNUSED2  0x400
66611 +#define _PAGE_UNUSED3  0x800
66612 +
66613 +/* If _PAGE_PRESENT is clear, we use these: */
66614 +#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
66615 +#define _PAGE_PROTNONE 0x080   /* if the user mapped it with PROT_NONE;
66616 +                                  pte_present gives true */
66617 +#ifdef CONFIG_X86_PAE
66618 +#define _PAGE_NX       (1ULL<<_PAGE_BIT_NX)
66619 +#else
66620 +#define _PAGE_NX       0
66621 +#endif
66622 +
66623 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
66624 +#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
66625 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
66626 +
66627 +#define PAGE_NONE \
66628 +       __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
66629 +#define PAGE_SHARED \
66630 +       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
66631 +
66632 +#define PAGE_SHARED_EXEC \
66633 +       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
66634 +#define PAGE_COPY_NOEXEC \
66635 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
66636 +#define PAGE_COPY_EXEC \
66637 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
66638 +#define PAGE_COPY \
66639 +       PAGE_COPY_NOEXEC
66640 +#define PAGE_READONLY \
66641 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
66642 +#define PAGE_READONLY_EXEC \
66643 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
66644 +
66645 +#define _PAGE_KERNEL \
66646 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
66647 +#define _PAGE_KERNEL_EXEC \
66648 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
66649 +
66650 +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
66651 +#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
66652 +#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD)
66653 +#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
66654 +#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
66655 +
66656 +#define PAGE_KERNEL            __pgprot(__PAGE_KERNEL)
66657 +#define PAGE_KERNEL_RO         __pgprot(__PAGE_KERNEL_RO)
66658 +#define PAGE_KERNEL_EXEC       __pgprot(__PAGE_KERNEL_EXEC)
66659 +#define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE)
66660 +#define PAGE_KERNEL_LARGE      __pgprot(__PAGE_KERNEL_LARGE)
66661 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
66662 +
66663 +/*
66664 + * The i386 can't do page protection for execute, and treats execute
66665 + * the same as read. Also, write permissions imply read permissions.
66666 + * This is the closest we can get..
66667 + */
66668 +#define __P000 PAGE_NONE
66669 +#define __P001 PAGE_READONLY
66670 +#define __P010 PAGE_COPY
66671 +#define __P011 PAGE_COPY
66672 +#define __P100 PAGE_READONLY_EXEC
66673 +#define __P101 PAGE_READONLY_EXEC
66674 +#define __P110 PAGE_COPY_EXEC
66675 +#define __P111 PAGE_COPY_EXEC
66676 +
66677 +#define __S000 PAGE_NONE
66678 +#define __S001 PAGE_READONLY
66679 +#define __S010 PAGE_SHARED
66680 +#define __S011 PAGE_SHARED
66681 +#define __S100 PAGE_READONLY_EXEC
66682 +#define __S101 PAGE_READONLY_EXEC
66683 +#define __S110 PAGE_SHARED_EXEC
66684 +#define __S111 PAGE_SHARED_EXEC
66685 +
66686 +/*
66687 + * Define this if things work differently on an i386 and an i486:
66688 + * it will (on an i486) warn about kernel memory accesses that are
66689 + * done without an 'access_ok(VERIFY_WRITE,..)'
66690 + */
66691 +#undef TEST_ACCESS_OK
66692 +
66693 +/* The boot page tables (all created as a single array) */
66694 +extern unsigned long pg0[];
66695 +
66696 +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
66697 +#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
66698 +
66699 +/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
66700 +#define pmd_none(x)    (!(unsigned long)pmd_val(x))
66701 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable page tables (wr.p.t.)
66702 +   can temporarily clear it. */
66703 +#define pmd_present(x) (pmd_val(x))
66704 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
66705 +#define pmd_bad(x)     ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
66706 +
66707 +
66708 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
66709 +
66710 +/*
66711 + * The following only work if pte_present() is true.
66712 + * Undefined behaviour if not..
66713 + */
66714 +#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT)
66715 +static inline int pte_user(pte_t pte)          { return (pte).pte_low & _PAGE_USER; }
66716 +static inline int pte_read(pte_t pte)          { return (pte).pte_low & _PAGE_USER; }
66717 +static inline int pte_dirty(pte_t pte)         { return (pte).pte_low & _PAGE_DIRTY; }
66718 +static inline int pte_young(pte_t pte)         { return (pte).pte_low & _PAGE_ACCESSED; }
66719 +static inline int pte_write(pte_t pte)         { return (pte).pte_low & _PAGE_RW; }
66720 +static inline int pte_huge(pte_t pte)          { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; }
66721 +
66722 +/*
66723 + * The following only works if pte_present() is not true.
66724 + */
66725 +static inline int pte_file(pte_t pte)          { return (pte).pte_low & _PAGE_FILE; }
66726 +
66727 +static inline pte_t pte_rdprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_USER; return pte; }
66728 +static inline pte_t pte_exprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_USER; return pte; }
66729 +static inline pte_t pte_mkclean(pte_t pte)     { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
66730 +static inline pte_t pte_mkold(pte_t pte)       { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
66731 +static inline pte_t pte_wrprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_RW; return pte; }
66732 +static inline pte_t pte_mkread(pte_t pte)      { (pte).pte_low |= _PAGE_USER; return pte; }
66733 +static inline pte_t pte_mkexec(pte_t pte)      { (pte).pte_low |= _PAGE_USER; return pte; }
66734 +static inline pte_t pte_mkdirty(pte_t pte)     { (pte).pte_low |= _PAGE_DIRTY; return pte; }
66735 +static inline pte_t pte_mkyoung(pte_t pte)     { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
66736 +static inline pte_t pte_mkwrite(pte_t pte)     { (pte).pte_low |= _PAGE_RW; return pte; }
66737 +static inline pte_t pte_mkhuge(pte_t pte)      { (pte).pte_low |= __LARGE_PTE; return pte; }
66738 +
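Each helper above returns the modified pte by value, so they compose freely. A minimal sketch using only the definitions above (the function name is ours):

        /* Sketch: demote a PTE to clean and read-only in one expression. */
        static inline pte_t example_demote(pte_t pte)
        {
                return pte_mkclean(pte_wrprotect(pte));
        }
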
66739 +#ifdef CONFIG_X86_PAE
66740 +# include <asm/pgtable-3level.h>
66741 +#else
66742 +# include <asm/pgtable-2level.h>
66743 +#endif
66744 +
66745 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
66746 +{
66747 +       if (!pte_dirty(*ptep))
66748 +               return 0;
66749 +       return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
66750 +}
66751 +
66752 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
66753 +{
66754 +       if (!pte_young(*ptep))
66755 +               return 0;
66756 +       return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
66757 +}
66758 +
66759 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
66760 +{
66761 +       pte_t pte;
66762 +       if (full) {
66763 +               pte = *ptep;
66764 +#ifdef CONFIG_X86_PAE
66765 +               /* Cannot do this in a single step, as the compiler may
66766 +                  issue the two stores in either order, but the hypervisor
66767 +                  must not see the high part before the low one. */
66768 +               ptep->pte_low = 0;
66769 +               barrier();
66770 +               ptep->pte_high = 0;
66771 +#else
66772 +               *ptep = __pte(0);
66773 +#endif
66774 +       } else {
66775 +               pte = ptep_get_and_clear(mm, addr, ptep);
66776 +       }
66777 +       return pte;
66778 +}
66779 +
66780 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
66781 +{
66782 +       if (pte_write(*ptep))
66783 +               clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
66784 +}
66785 +
66786 +/*
66787 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
66788 + *
66789 + *  dst - pointer to pgd range anywhere on a pgd page
66790 + *  src - ""
66791 + *  count - the number of pgds to copy.
66792 + *
66793 + * dst and src can be on the same page, but the range must not overlap,
66794 + * and must not cross a page boundary.
66795 + */
66796 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
66797 +{
66798 +       memcpy(dst, src, count * sizeof(pgd_t));
66799 +}
66800 +
66801 +/*
66802 + * Macro to mark a page protection value as "uncacheable".  On processors which do not support
66803 + * it, this is a no-op.
66804 + */
66805 +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3)                                          \
66806 +                                ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
66807 +
66808 +/*
66809 + * Conversion functions: convert a page and protection to a page entry,
66810 + * and a page entry and page directory to the page they refer to.
66811 + */
66812 +
66813 +#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
66814 +
66815 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
66816 +{
66817 +       pte.pte_low &= _PAGE_CHG_MASK;
66818 +       pte.pte_low |= pgprot_val(newprot);
66819 +#ifdef CONFIG_X86_PAE
66820 +       /*
66821 +        * Chop off the NX bit (if present), and add the NX portion of
66822 +        * the newprot (if present):
66823 +        */
66824 +       pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
66825 +       pte.pte_high |= (pgprot_val(newprot) >> 32) & \
66826 +                                       (__supported_pte_mask >> 32);
66827 +#endif
66828 +       return pte;
66829 +}
66830 +
66831 +#define pmd_large(pmd) \
66832 +((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
66833 +
66834 +/*
66835 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
66836 + *
66837 + * this macro returns the index of the entry in the pgd page which would
66838 + * control the given virtual address
66839 + */
66840 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
66841 +#define pgd_index_k(addr) pgd_index(addr)
66842 +
66843 +/*
66844 + * pgd_offset() returns a (pgd_t *)
66845 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
66846 + */
66847 +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
66848 +
66849 +/*
66850 + * a shortcut which implies the use of the kernel's pgd, instead
66851 + * of a process's
66852 + */
66853 +#define pgd_offset_k(address) pgd_offset(&init_mm, address)
66854 +
66855 +/*
66856 + * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
66857 + *
66858 + * this macro returns the index of the entry in the pmd page which would
66859 + * control the given virtual address
66860 + */
66861 +#define pmd_index(address) \
66862 +               (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
66863 +
66864 +/*
66865 + * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
66866 + *
66867 + * this macro returns the index of the entry in the pte page which would
66868 + * control the given virtual address
66869 + */
66870 +#define pte_index(address) \
66871 +               (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
66872 +#define pte_offset_kernel(dir, address) \
66873 +       ((pte_t *) pmd_page_kernel(*(dir)) +  pte_index(address))
66874 +
66875 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
66876 +
66877 +#define pmd_page_kernel(pmd) \
66878 +               ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
66879 +
66880 +/*
66881 + * Helper function that returns the kernel pagetable entry controlling
66882 + * the virtual address 'address'. NULL means no pagetable entry present.
66883 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
66884 + * as a pte too.
66885 + */
66886 +extern pte_t *lookup_address(unsigned long address);
66887 +
66888 +/*
66889 + * Make a given kernel text page executable/non-executable.
66890 + * Returns the previous executability setting of that page (which
66891 + * is used to restore the previous state). Used by the SMP bootup code.
66892 + * NOTE: this is an __init function for security reasons.
66893 + */
66894 +#ifdef CONFIG_X86_PAE
66895 + extern int set_kernel_exec(unsigned long vaddr, int enable);
66896 +#else
66897 + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
66898 +#endif
66899 +
66900 +extern void noexec_setup(const char *str);
66901 +
66902 +#if defined(CONFIG_HIGHPTE)
66903 +#define pte_offset_map(dir, address) \
66904 +       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
66905 +        pte_index(address))
66906 +#define pte_offset_map_nested(dir, address) \
66907 +       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
66908 +        pte_index(address))
66909 +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
66910 +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
66911 +#else
66912 +#define pte_offset_map(dir, address) \
66913 +       ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
66914 +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
66915 +#define pte_unmap(pte) do { } while (0)
66916 +#define pte_unmap_nested(pte) do { } while (0)
66917 +#endif
66918 +
66919 +/*
66920 + * The i386 doesn't have any external MMU info: the kernel page
66921 + * tables contain all the necessary information.
66922 + *
66923 + * Also, we only update the dirty/accessed state if we set
66924 + * the dirty bit by hand in the kernel, since the hardware
66925 + * will do the accessed bit for us, and we don't want to
66926 + * race with other CPUs that might be updating the dirty
66927 + * bit at the same time.
66928 + */
66929 +#define update_mmu_cache(vma,address,pte) do { } while (0)
66930 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
66931 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
66932 +       do {                                                              \
66933 +               if (__dirty) {                                            \
66934 +                       if ( likely((__vma)->vm_mm == current->mm) ) {    \
66935 +                           BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
66936 +                       } else {                                          \
66937 +                            xen_l1_entry_update((__ptep), (__entry)); \
66938 +                           flush_tlb_page((__vma), (__address));         \
66939 +                       }                                                 \
66940 +               }                                                         \
66941 +       } while (0)
66942 +
66943 +#define __HAVE_ARCH_PTEP_ESTABLISH
66944 +#define ptep_establish(__vma, __address, __ptep, __entry)              \
66945 +do {                                                                   \
66946 +       ptep_set_access_flags(__vma, __address, __ptep, __entry, 1);    \
66947 +} while (0)
66948 +
66949 +#include <xen/features.h>
66950 +void make_lowmem_page_readonly(void *va, unsigned int feature);
66951 +void make_lowmem_page_writable(void *va, unsigned int feature);
66952 +void make_page_readonly(void *va, unsigned int feature);
66953 +void make_page_writable(void *va, unsigned int feature);
66954 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
66955 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
66956 +
66957 +#define virt_to_ptep(__va)                                             \
66958 +({                                                                     \
66959 +       pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));             \
66960 +       pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));        \
66961 +       pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));        \
66962 +       pte_offset_kernel(__pmd, (unsigned long)(__va));                \
66963 +})
66964 +
66965 +#define arbitrary_virt_to_machine(__va)                                        \
66966 +({                                                                     \
66967 +       maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
66968 +       m | ((unsigned long)(__va) & (PAGE_SIZE-1));                    \
66969 +})
66970 +
66971 +#endif /* !__ASSEMBLY__ */
66972 +
66973 +#ifdef CONFIG_FLATMEM
66974 +#define kern_addr_valid(addr)  (1)
66975 +#endif /* CONFIG_FLATMEM */
66976 +
66977 +int direct_remap_pfn_range(struct vm_area_struct *vma,
66978 +                           unsigned long address, 
66979 +                           unsigned long mfn,
66980 +                           unsigned long size, 
66981 +                           pgprot_t prot,
66982 +                           domid_t  domid);
66983 +int direct_kernel_remap_pfn_range(unsigned long address, 
66984 +                                 unsigned long mfn,
66985 +                                 unsigned long size, 
66986 +                                 pgprot_t prot,
66987 +                                 domid_t  domid);
66988 +int create_lookup_pte_addr(struct mm_struct *mm,
66989 +                           unsigned long address,
66990 +                           uint64_t *ptep);
66991 +int touch_pte_range(struct mm_struct *mm,
66992 +                    unsigned long address,
66993 +                    unsigned long size);
66994 +
66995 +#define io_remap_pfn_range(vma,from,pfn,size,prot) \
66996 +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
66997 +
66998 +#define MK_IOSPACE_PFN(space, pfn)     (pfn)
66999 +#define GET_IOSPACE(pfn)               0
67000 +#define GET_PFN(pfn)                   (pfn)
67001 +
67002 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
67003 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
67004 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
67005 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
67006 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
67007 +#define __HAVE_ARCH_PTE_SAME
67008 +#include <asm-generic/pgtable.h>
67009 +
67010 +#endif /* _I386_PGTABLE_H */
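
The pmd_index()/pte_index() macros in the header above just slice a virtual address into table indices with shifts and masks. A minimal standalone sketch of that arithmetic follows; the constants are assumed PAE-style values (PAGE_SHIFT 12, PMD_SHIFT 21, 512 entries per table) chosen for illustration, not taken from this patch.

/*
 * Standalone illustration of the index arithmetic used by
 * pmd_index()/pte_index() above.  Constants assume a PAE split;
 * they are stand-ins for the real <asm/pgtable*.h> definitions.
 */
#include <stdio.h>

#define PAGE_SHIFT   12
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define PMD_SHIFT    21                    /* PAE: one pmd entry maps 2MB */
#define PTRS_PER_PMD 512
#define PTRS_PER_PTE 512

#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))

int main(void)
{
        unsigned long va = 0xc0123456UL;   /* arbitrary kernel address */

        printf("va %#lx -> pmd slot %lu, pte slot %lu, page offset %#lx\n",
               va, pmd_index(va), pte_index(va), va & (PAGE_SIZE - 1));
        return 0;
}

The same composition shows up in arbitrary_virt_to_machine() above: the machine address is the pte's frame number shifted left by PAGE_SHIFT, OR'd with the low PAGE_SIZE-1 bits of the virtual address.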
67011 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/processor.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/processor.h
67012 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/processor.h        1970-01-01 01:00:00.000000000 +0100
67013 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/processor.h   2006-04-10 00:05:52.000000000 +0200
67014 @@ -0,0 +1,753 @@
67015 +/*
67016 + * include/asm-i386/processor.h
67017 + *
67018 + * Copyright (C) 1994 Linus Torvalds
67019 + */
67020 +
67021 +#ifndef __ASM_I386_PROCESSOR_H
67022 +#define __ASM_I386_PROCESSOR_H
67023 +
67024 +#include <asm/vm86.h>
67025 +#include <asm/math_emu.h>
67026 +#include <asm/segment.h>
67027 +#include <asm/page.h>
67028 +#include <asm/types.h>
67029 +#include <asm/sigcontext.h>
67030 +#include <asm/cpufeature.h>
67031 +#include <asm/msr.h>
67032 +#include <asm/system.h>
67033 +#include <linux/cache.h>
67034 +#include <linux/config.h>
67035 +#include <linux/threads.h>
67036 +#include <asm/percpu.h>
67037 +#include <xen/interface/physdev.h>
67038 +
67039 +/* flag for disabling the tsc */
67040 +extern int tsc_disable;
67041 +
67042 +struct desc_struct {
67043 +       unsigned long a,b;
67044 +};
67045 +
67046 +#define desc_empty(desc) \
67047 +               (!((desc)->a | (desc)->b))
67048 +
67049 +#define desc_equal(desc1, desc2) \
67050 +               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
67051 +/*
67052 + * Default implementation of macro that returns current
67053 + * instruction pointer ("program counter").
67054 + */
67055 +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
67056 +
67057 +/*
67058 + *  CPU type and hardware bug flags. Kept separately for each CPU.
67059 + *  Members of this structure are referenced in head.S, so think twice
67060 + *  before touching them. [mj]
67061 + */
67062 +
67063 +struct cpuinfo_x86 {
67064 +       __u8    x86;            /* CPU family */
67065 +       __u8    x86_vendor;     /* CPU vendor */
67066 +       __u8    x86_model;
67067 +       __u8    x86_mask;
67068 +       char    wp_works_ok;    /* It doesn't on 386's */
67069 +       char    hlt_works_ok;   /* Problems on some 486Dx4's and old 386's */
67070 +       char    hard_math;
67071 +       char    rfu;
67072 +               int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
67073 +       unsigned long   x86_capability[NCAPINTS];
67074 +       char    x86_vendor_id[16];
67075 +       char    x86_model_id[64];
67076 +       int     x86_cache_size;  /* in KB - valid for CPUS which support this
67077 +                                   call  */
67078 +       int     x86_cache_alignment;    /* In bytes */
67079 +       char    fdiv_bug;
67080 +       char    f00f_bug;
67081 +       char    coma_bug;
67082 +       char    pad0;
67083 +       int     x86_power;
67084 +       unsigned long loops_per_jiffy;
67085 +       unsigned char x86_max_cores;    /* cpuid returned max cores value */
67086 +       unsigned char booted_cores;     /* number of cores as seen by OS */
67087 +       unsigned char apicid;
67088 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
67089 +
67090 +#define X86_VENDOR_INTEL 0
67091 +#define X86_VENDOR_CYRIX 1
67092 +#define X86_VENDOR_AMD 2
67093 +#define X86_VENDOR_UMC 3
67094 +#define X86_VENDOR_NEXGEN 4
67095 +#define X86_VENDOR_CENTAUR 5
67096 +#define X86_VENDOR_RISE 6
67097 +#define X86_VENDOR_TRANSMETA 7
67098 +#define X86_VENDOR_NSC 8
67099 +#define X86_VENDOR_NUM 9
67100 +#define X86_VENDOR_UNKNOWN 0xff
67101 +
67102 +/*
67103 + * capabilities of CPUs
67104 + */
67105 +
67106 +extern struct cpuinfo_x86 boot_cpu_data;
67107 +extern struct cpuinfo_x86 new_cpu_data;
67108 +#ifndef CONFIG_X86_NO_TSS
67109 +extern struct tss_struct doublefault_tss;
67110 +DECLARE_PER_CPU(struct tss_struct, init_tss);
67111 +#endif
67112 +
67113 +#ifdef CONFIG_SMP
67114 +extern struct cpuinfo_x86 cpu_data[];
67115 +#define current_cpu_data cpu_data[smp_processor_id()]
67116 +#else
67117 +#define cpu_data (&boot_cpu_data)
67118 +#define current_cpu_data boot_cpu_data
67119 +#endif
67120 +
67121 +extern int phys_proc_id[NR_CPUS];
67122 +extern int cpu_core_id[NR_CPUS];
67123 +extern char ignore_fpu_irq;
67124 +
67125 +extern void identify_cpu(struct cpuinfo_x86 *);
67126 +extern void print_cpu_info(struct cpuinfo_x86 *);
67127 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
67128 +
67129 +#ifdef CONFIG_X86_HT
67130 +extern void detect_ht(struct cpuinfo_x86 *c);
67131 +#else
67132 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
67133 +#endif
67134 +
67135 +/*
67136 + * EFLAGS bits
67137 + */
67138 +#define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
67139 +#define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
67140 +#define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
67141 +#define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
67142 +#define X86_EFLAGS_SF  0x00000080 /* Sign Flag */
67143 +#define X86_EFLAGS_TF  0x00000100 /* Trap Flag */
67144 +#define X86_EFLAGS_IF  0x00000200 /* Interrupt Flag */
67145 +#define X86_EFLAGS_DF  0x00000400 /* Direction Flag */
67146 +#define X86_EFLAGS_OF  0x00000800 /* Overflow Flag */
67147 +#define X86_EFLAGS_IOPL        0x00003000 /* IOPL mask */
67148 +#define X86_EFLAGS_NT  0x00004000 /* Nested Task */
67149 +#define X86_EFLAGS_RF  0x00010000 /* Resume Flag */
67150 +#define X86_EFLAGS_VM  0x00020000 /* Virtual Mode */
67151 +#define X86_EFLAGS_AC  0x00040000 /* Alignment Check */
67152 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
67153 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
67154 +#define X86_EFLAGS_ID  0x00200000 /* CPUID detection flag */
67155 +
67156 +/*
67157 + * Generic CPUID function
67158 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
67159 + * resulting in stale register contents being returned.
67160 + */
67161 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
67162 +{
67163 +       __asm__(XEN_CPUID
67164 +               : "=a" (*eax),
67165 +                 "=b" (*ebx),
67166 +                 "=c" (*ecx),
67167 +                 "=d" (*edx)
67168 +               : "0" (op), "c"(0));
67169 +}
67170 +
67171 +/* Some CPUID calls want 'count' to be placed in ecx */
67172 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
67173 +               int *edx)
67174 +{
67175 +       __asm__(XEN_CPUID
67176 +               : "=a" (*eax),
67177 +                 "=b" (*ebx),
67178 +                 "=c" (*ecx),
67179 +                 "=d" (*edx)
67180 +               : "0" (op), "c" (count));
67181 +}
67182 +
67183 +/*
67184 + * CPUID functions returning a single datum
67185 + */
67186 +static inline unsigned int cpuid_eax(unsigned int op)
67187 +{
67188 +       unsigned int eax;
67189 +
67190 +       __asm__(XEN_CPUID
67191 +               : "=a" (eax)
67192 +               : "0" (op)
67193 +               : "bx", "cx", "dx");
67194 +       return eax;
67195 +}
67196 +static inline unsigned int cpuid_ebx(unsigned int op)
67197 +{
67198 +       unsigned int eax, ebx;
67199 +
67200 +       __asm__(XEN_CPUID
67201 +               : "=a" (eax), "=b" (ebx)
67202 +               : "0" (op)
67203 +               : "cx", "dx" );
67204 +       return ebx;
67205 +}
67206 +static inline unsigned int cpuid_ecx(unsigned int op)
67207 +{
67208 +       unsigned int eax, ecx;
67209 +
67210 +       __asm__(XEN_CPUID
67211 +               : "=a" (eax), "=c" (ecx)
67212 +               : "0" (op)
67213 +               : "bx", "dx" );
67214 +       return ecx;
67215 +}
67216 +static inline unsigned int cpuid_edx(unsigned int op)
67217 +{
67218 +       unsigned int eax, edx;
67219 +
67220 +       __asm__(XEN_CPUID
67221 +               : "=a" (eax), "=d" (edx)
67222 +               : "0" (op)
67223 +               : "bx", "cx");
67224 +       return edx;
67225 +}
67226 +
67227 +#define load_cr3(pgdir) write_cr3(__pa(pgdir))
67228 +
67229 +/*
67230 + * Intel CPU features in CR4
67231 + */
67232 +#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
67233 +#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
67234 +#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
67235 +#define X86_CR4_DE             0x0008  /* enable debugging extensions */
67236 +#define X86_CR4_PSE            0x0010  /* enable page size extensions */
67237 +#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
67238 +#define X86_CR4_MCE            0x0040  /* Machine check enable */
67239 +#define X86_CR4_PGE            0x0080  /* enable global pages */
67240 +#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
67241 +#define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
67242 +#define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
67243 +
67244 +/*
67245 + * Save the cr4 feature set we're using (ie
67246 + * Pentium 4MB enable and PPro Global page
67247 + * enable), so that any CPUs that boot up
67248 + * after us can get the correct flags.
67249 + */
67250 +extern unsigned long mmu_cr4_features;
67251 +
67252 +static inline void set_in_cr4 (unsigned long mask)
67253 +{
67254 +       unsigned cr4;
67255 +       mmu_cr4_features |= mask;
67256 +       cr4 = read_cr4();
67257 +       cr4 |= mask;
67258 +       write_cr4(cr4);
67259 +}
67260 +
67261 +static inline void clear_in_cr4 (unsigned long mask)
67262 +{
67263 +       unsigned cr4;
67264 +       mmu_cr4_features &= ~mask;
67265 +       cr4 = read_cr4();
67266 +       cr4 &= ~mask;
67267 +       write_cr4(cr4);
67268 +}
67269 +
67270 +/*
67271 + *      NSC/Cyrix CPU configuration register indexes
67272 + */
67273 +
67274 +#define CX86_PCR0 0x20
67275 +#define CX86_GCR  0xb8
67276 +#define CX86_CCR0 0xc0
67277 +#define CX86_CCR1 0xc1
67278 +#define CX86_CCR2 0xc2
67279 +#define CX86_CCR3 0xc3
67280 +#define CX86_CCR4 0xe8
67281 +#define CX86_CCR5 0xe9
67282 +#define CX86_CCR6 0xea
67283 +#define CX86_CCR7 0xeb
67284 +#define CX86_PCR1 0xf0
67285 +#define CX86_DIR0 0xfe
67286 +#define CX86_DIR1 0xff
67287 +#define CX86_ARR_BASE 0xc4
67288 +#define CX86_RCR_BASE 0xdc
67289 +
67290 +/*
67291 + *      NSC/Cyrix CPU indexed register access macros
67292 + */
67293 +
67294 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
67295 +
67296 +#define setCx86(reg, data) do { \
67297 +       outb((reg), 0x22); \
67298 +       outb((data), 0x23); \
67299 +} while (0)
67300 +
67301 +/* Stop speculative execution */
67302 +static inline void sync_core(void)
67303 +{
67304 +       int tmp;
67305 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
67306 +}
67307 +
67308 +static inline void __monitor(const void *eax, unsigned long ecx,
67309 +               unsigned long edx)
67310 +{
67311 +       /* "monitor %eax,%ecx,%edx;" */
67312 +       asm volatile(
67313 +               ".byte 0x0f,0x01,0xc8;"
67314 +               : :"a" (eax), "c" (ecx), "d"(edx));
67315 +}
67316 +
67317 +static inline void __mwait(unsigned long eax, unsigned long ecx)
67318 +{
67319 +       /* "mwait %eax,%ecx;" */
67320 +       asm volatile(
67321 +               ".byte 0x0f,0x01,0xc9;"
67322 +               : :"a" (eax), "c" (ecx));
67323 +}
67324 +
67325 +/* from system description table in BIOS.  Mostly for MCA use, but
67326 +others may find it useful. */
67327 +extern unsigned int machine_id;
67328 +extern unsigned int machine_submodel_id;
67329 +extern unsigned int BIOS_revision;
67330 +extern unsigned int mca_pentium_flag;
67331 +
67332 +/* Boot loader type from the setup header */
67333 +extern int bootloader_type;
67334 +
67335 +/*
67336 + * User space process size: 3GB (default).
67337 + */
67338 +#define TASK_SIZE      (PAGE_OFFSET)
67339 +
67340 +/* This decides where the kernel will search for a free chunk of vm
67341 + * space during mmap's.
67342 + */
67343 +#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
67344 +
67345 +#define HAVE_ARCH_PICK_MMAP_LAYOUT
67346 +
67347 +/*
67348 + * Size of io_bitmap.
67349 + */
67350 +#define IO_BITMAP_BITS  65536
67351 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
67352 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
67353 +#ifndef CONFIG_X86_NO_TSS
67354 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
67355 +#endif
67356 +#define INVALID_IO_BITMAP_OFFSET 0x8000
67357 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
67358 +
67359 +struct i387_fsave_struct {
67360 +       long    cwd;
67361 +       long    swd;
67362 +       long    twd;
67363 +       long    fip;
67364 +       long    fcs;
67365 +       long    foo;
67366 +       long    fos;
67367 +       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
67368 +       long    status;         /* software status information */
67369 +};
67370 +
67371 +struct i387_fxsave_struct {
67372 +       unsigned short  cwd;
67373 +       unsigned short  swd;
67374 +       unsigned short  twd;
67375 +       unsigned short  fop;
67376 +       long    fip;
67377 +       long    fcs;
67378 +       long    foo;
67379 +       long    fos;
67380 +       long    mxcsr;
67381 +       long    mxcsr_mask;
67382 +       long    st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
67383 +       long    xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
67384 +       long    padding[56];
67385 +} __attribute__ ((aligned (16)));
67386 +
67387 +struct i387_soft_struct {
67388 +       long    cwd;
67389 +       long    swd;
67390 +       long    twd;
67391 +       long    fip;
67392 +       long    fcs;
67393 +       long    foo;
67394 +       long    fos;
67395 +       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
67396 +       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
67397 +       struct info     *info;
67398 +       unsigned long   entry_eip;
67399 +};
67400 +
67401 +union i387_union {
67402 +       struct i387_fsave_struct        fsave;
67403 +       struct i387_fxsave_struct       fxsave;
67404 +       struct i387_soft_struct soft;
67405 +};
67406 +
67407 +typedef struct {
67408 +       unsigned long seg;
67409 +} mm_segment_t;
67410 +
67411 +struct thread_struct;
67412 +
67413 +#ifndef CONFIG_X86_NO_TSS
67414 +struct tss_struct {
67415 +       unsigned short  back_link,__blh;
67416 +       unsigned long   esp0;
67417 +       unsigned short  ss0,__ss0h;
67418 +       unsigned long   esp1;
67419 +       unsigned short  ss1,__ss1h;     /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
67420 +       unsigned long   esp2;
67421 +       unsigned short  ss2,__ss2h;
67422 +       unsigned long   __cr3;
67423 +       unsigned long   eip;
67424 +       unsigned long   eflags;
67425 +       unsigned long   eax,ecx,edx,ebx;
67426 +       unsigned long   esp;
67427 +       unsigned long   ebp;
67428 +       unsigned long   esi;
67429 +       unsigned long   edi;
67430 +       unsigned short  es, __esh;
67431 +       unsigned short  cs, __csh;
67432 +       unsigned short  ss, __ssh;
67433 +       unsigned short  ds, __dsh;
67434 +       unsigned short  fs, __fsh;
67435 +       unsigned short  gs, __gsh;
67436 +       unsigned short  ldt, __ldth;
67437 +       unsigned short  trace, io_bitmap_base;
67438 +       /*
67439 +        * The extra 1 is there because the CPU will access an
67440 +        * additional byte beyond the end of the IO permission
67441 +        * bitmap. The extra byte must be all 1 bits, and must
67442 +        * be within the limit.
67443 +        */
67444 +       unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
67445 +       /*
67446 +        * Cache the current maximum and the last task that used the bitmap:
67447 +        */
67448 +       unsigned long io_bitmap_max;
67449 +       struct thread_struct *io_bitmap_owner;
67450 +       /*
67451 +        * pads the TSS to be cacheline-aligned (size is 0x100)
67452 +        */
67453 +       unsigned long __cacheline_filler[35];
67454 +       /*
67455 +        * .. and then another 0x100 bytes for emergency kernel stack
67456 +        */
67457 +       unsigned long stack[64];
67458 +} __attribute__((packed));
67459 +#endif
67460 +
67461 +#define ARCH_MIN_TASKALIGN     16
67462 +
67463 +struct thread_struct {
67464 +/* cached TLS descriptors. */
67465 +       struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
67466 +       unsigned long   esp0;
67467 +       unsigned long   sysenter_cs;
67468 +       unsigned long   eip;
67469 +       unsigned long   esp;
67470 +       unsigned long   fs;
67471 +       unsigned long   gs;
67472 +/* Hardware debugging registers */
67473 +       unsigned long   debugreg[8];  /* %%db0-7 debug registers */
67474 +/* fault info */
67475 +       unsigned long   cr2, trap_no, error_code;
67476 +/* floating point info */
67477 +       union i387_union        i387;
67478 +/* virtual 86 mode info */
67479 +       struct vm86_struct __user * vm86_info;
67480 +       unsigned long           screen_bitmap;
67481 +       unsigned long           v86flags, v86mask, saved_esp0;
67482 +       unsigned int            saved_fs, saved_gs;
67483 +/* IO permissions */
67484 +       unsigned long   *io_bitmap_ptr;
67485 +       unsigned long   iopl;
67486 +/* max allowed port in the bitmap, in bytes: */
67487 +       unsigned long   io_bitmap_max;
67488 +};
67489 +
67490 +#define INIT_THREAD  {                                                 \
67491 +       .vm86_info = NULL,                                              \
67492 +       .sysenter_cs = __KERNEL_CS,                                     \
67493 +       .io_bitmap_ptr = NULL,                                          \
67494 +}
67495 +
67496 +#ifndef CONFIG_X86_NO_TSS
67497 +/*
67498 + * Note that the .io_bitmap member must be extra-big. This is because
67499 + * the CPU will access an additional byte beyond the end of the IO
67500 + * permission bitmap. The extra byte must be all 1 bits, and must
67501 + * be within the limit.
67502 + */
67503 +#define INIT_TSS  {                                                    \
67504 +       .esp0           = sizeof(init_stack) + (long)&init_stack,       \
67505 +       .ss0            = __KERNEL_DS,                                  \
67506 +       .ss1            = __KERNEL_CS,                                  \
67507 +       .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,                     \
67508 +       .io_bitmap      = { [ 0 ... IO_BITMAP_LONGS] = ~0 },            \
67509 +}
67510 +
67511 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
67512 +{
67513 +       tss->esp0 = thread->esp0;
67514 +#ifdef CONFIG_X86_SYSENTER
67515 +       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
67516 +       if (unlikely(tss->ss1 != thread->sysenter_cs)) {
67517 +               tss->ss1 = thread->sysenter_cs;
67518 +               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
67519 +       }
67520 +#endif
67521 +}
67522 +#define load_esp0(tss, thread) \
67523 +       __load_esp0(tss, thread)
67524 +#else
67525 +#define load_esp0(tss, thread) \
67526 +       HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)
67527 +#endif
67528 +
67529 +#define start_thread(regs, new_eip, new_esp) do {              \
67530 +       __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));       \
67531 +       set_fs(USER_DS);                                        \
67532 +       regs->xds = __USER_DS;                                  \
67533 +       regs->xes = __USER_DS;                                  \
67534 +       regs->xss = __USER_DS;                                  \
67535 +       regs->xcs = __USER_CS;                                  \
67536 +       regs->eip = new_eip;                                    \
67537 +       regs->esp = new_esp;                                    \
67538 +} while (0)
67539 +
67540 +/*
67541 + * These special macros can be used to get or set a debugging register
67542 + */
67543 +#define get_debugreg(var, register)                            \
67544 +               (var) = HYPERVISOR_get_debugreg((register))
67545 +#define set_debugreg(value, register)                  \
67546 +               HYPERVISOR_set_debugreg((register), (value))
67547 +
67548 +/*
67549 + * Set IOPL bits in EFLAGS from given mask
67550 + */
67551 +static inline void set_iopl_mask(unsigned mask)
67552 +{
67553 +       physdev_op_t op;
67554 +
67555 +       /* Force the change at ring 0. */
67556 +       op.cmd = PHYSDEVOP_SET_IOPL;
67557 +       op.u.set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
67558 +       HYPERVISOR_physdev_op(&op);
67559 +}
67560 +
67561 +/* Forward declaration, a strange C thing */
67562 +struct task_struct;
67563 +struct mm_struct;
67564 +
67565 +/* Free all resources held by a thread. */
67566 +extern void release_thread(struct task_struct *);
67567 +
67568 +/* Prepare to copy thread state - unlazy all lazy status */
67569 +extern void prepare_to_copy(struct task_struct *tsk);
67570 +
67571 +/*
67572 + * create a kernel thread without removing it from tasklists
67573 + */
67574 +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
67575 +
67576 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
67577 +void show_trace(struct task_struct *task, unsigned long *stack);
67578 +
67579 +unsigned long get_wchan(struct task_struct *p);
67580 +
67581 +#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
67582 +#define KSTK_TOP(info)                                                 \
67583 +({                                                                     \
67584 +       unsigned long *__ptr = (unsigned long *)(info);                 \
67585 +       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
67586 +})
67587 +
67588 +/*
67589 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
67590 + * This is necessary to guarantee that the entire "struct pt_regs"
67591 + * is accessible even if the CPU hasn't stored the SS/ESP registers
67592 + * on the stack (interrupt gate does not save these registers
67593 + * when switching to the same priv ring).
67594 + * Therefore beware: accessing the xss/esp fields of the
67595 + * "struct pt_regs" is possible, but they may contain the
67596 + * completely wrong values.
67597 + */
67598 +#define task_pt_regs(task)                                             \
67599 +({                                                                     \
67600 +       struct pt_regs *__regs__;                                       \
67601 +       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
67602 +       __regs__ - 1;                                                   \
67603 +})
67604 +
67605 +#define KSTK_EIP(task) (task_pt_regs(task)->eip)
67606 +#define KSTK_ESP(task) (task_pt_regs(task)->esp)
67607 +
67608 +
67609 +struct microcode_header {
67610 +       unsigned int hdrver;
67611 +       unsigned int rev;
67612 +       unsigned int date;
67613 +       unsigned int sig;
67614 +       unsigned int cksum;
67615 +       unsigned int ldrver;
67616 +       unsigned int pf;
67617 +       unsigned int datasize;
67618 +       unsigned int totalsize;
67619 +       unsigned int reserved[3];
67620 +};
67621 +
67622 +struct microcode {
67623 +       struct microcode_header hdr;
67624 +       unsigned int bits[0];
67625 +};
67626 +
67627 +typedef struct microcode microcode_t;
67628 +typedef struct microcode_header microcode_header_t;
67629 +
67630 +/* microcode format is extended from prescott processors */
67631 +struct extended_signature {
67632 +       unsigned int sig;
67633 +       unsigned int pf;
67634 +       unsigned int cksum;
67635 +};
67636 +
67637 +struct extended_sigtable {
67638 +       unsigned int count;
67639 +       unsigned int cksum;
67640 +       unsigned int reserved[3];
67641 +       struct extended_signature sigs[0];
67642 +};
67643 +/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
67644 +#define MICROCODE_IOCFREE      _IO('6',0)
67645 +
67646 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
67647 +static inline void rep_nop(void)
67648 +{
67649 +       __asm__ __volatile__("rep;nop": : :"memory");
67650 +}
67651 +
67652 +#define cpu_relax()    rep_nop()
67653 +
67654 +/* generic versions from gas */
67655 +#define GENERIC_NOP1   ".byte 0x90\n"
67656 +#define GENERIC_NOP2           ".byte 0x89,0xf6\n"
67657 +#define GENERIC_NOP3        ".byte 0x8d,0x76,0x00\n"
67658 +#define GENERIC_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
67659 +#define GENERIC_NOP5        GENERIC_NOP1 GENERIC_NOP4
67660 +#define GENERIC_NOP6   ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
67661 +#define GENERIC_NOP7   ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
67662 +#define GENERIC_NOP8   GENERIC_NOP1 GENERIC_NOP7
67663 +
67664 +/* Opteron nops */
67665 +#define K8_NOP1 GENERIC_NOP1
67666 +#define K8_NOP2        ".byte 0x66,0x90\n" 
67667 +#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
67668 +#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
67669 +#define K8_NOP5        K8_NOP3 K8_NOP2 
67670 +#define K8_NOP6        K8_NOP3 K8_NOP3
67671 +#define K8_NOP7        K8_NOP4 K8_NOP3
67672 +#define K8_NOP8        K8_NOP4 K8_NOP4
67673 +
67674 +/* K7 nops */
67675 +/* uses eax dependencies (arbitrary choice) */
67676 +#define K7_NOP1  GENERIC_NOP1
67677 +#define K7_NOP2        ".byte 0x8b,0xc0\n" 
67678 +#define K7_NOP3        ".byte 0x8d,0x04,0x20\n"
67679 +#define K7_NOP4        ".byte 0x8d,0x44,0x20,0x00\n"
67680 +#define K7_NOP5        K7_NOP4 ASM_NOP1
67681 +#define K7_NOP6        ".byte 0x8d,0x80,0,0,0,0\n"
67682 +#define K7_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
67683 +#define K7_NOP8        K7_NOP7 ASM_NOP1
67684 +
67685 +#ifdef CONFIG_MK8
67686 +#define ASM_NOP1 K8_NOP1
67687 +#define ASM_NOP2 K8_NOP2
67688 +#define ASM_NOP3 K8_NOP3
67689 +#define ASM_NOP4 K8_NOP4
67690 +#define ASM_NOP5 K8_NOP5
67691 +#define ASM_NOP6 K8_NOP6
67692 +#define ASM_NOP7 K8_NOP7
67693 +#define ASM_NOP8 K8_NOP8
67694 +#elif defined(CONFIG_MK7)
67695 +#define ASM_NOP1 K7_NOP1
67696 +#define ASM_NOP2 K7_NOP2
67697 +#define ASM_NOP3 K7_NOP3
67698 +#define ASM_NOP4 K7_NOP4
67699 +#define ASM_NOP5 K7_NOP5
67700 +#define ASM_NOP6 K7_NOP6
67701 +#define ASM_NOP7 K7_NOP7
67702 +#define ASM_NOP8 K7_NOP8
67703 +#else
67704 +#define ASM_NOP1 GENERIC_NOP1
67705 +#define ASM_NOP2 GENERIC_NOP2
67706 +#define ASM_NOP3 GENERIC_NOP3
67707 +#define ASM_NOP4 GENERIC_NOP4
67708 +#define ASM_NOP5 GENERIC_NOP5
67709 +#define ASM_NOP6 GENERIC_NOP6
67710 +#define ASM_NOP7 GENERIC_NOP7
67711 +#define ASM_NOP8 GENERIC_NOP8
67712 +#endif
67713 +
67714 +#define ASM_NOP_MAX 8
67715 +
67716 +/* Prefetch instructions for Pentium III and AMD Athlon */
67717 +/* It's not worth caring about 3dnow! prefetches for the K6
67718 +   because they are microcoded there and very slow.
67719 +   However, we don't currently do prefetches for pre-XP Athlons;
67720 +   that should be fixed. */
67721 +#define ARCH_HAS_PREFETCH
67722 +static inline void prefetch(const void *x)
67723 +{
67724 +       alternative_input(ASM_NOP4,
67725 +                         "prefetchnta (%1)",
67726 +                         X86_FEATURE_XMM,
67727 +                         "r" (x));
67728 +}
67729 +
67730 +#define ARCH_HAS_PREFETCH
67731 +#define ARCH_HAS_PREFETCHW
67732 +#define ARCH_HAS_SPINLOCK_PREFETCH
67733 +
67734 +/* 3dnow! prefetch to get an exclusive cache line. Useful for 
67735 +   spinlocks to avoid one state transition in the cache coherency protocol. */
67736 +static inline void prefetchw(const void *x)
67737 +{
67738 +       alternative_input(ASM_NOP4,
67739 +                         "prefetchw (%1)",
67740 +                         X86_FEATURE_3DNOW,
67741 +                         "r" (x));
67742 +}
67743 +#define spin_lock_prefetch(x)  prefetchw(x)
67744 +
67745 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
67746 +
67747 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
67748 +
67749 +extern unsigned long boot_option_idle_override;
67750 +extern void enable_sep_cpu(void);
67751 +extern int sysenter_setup(void);
67752 +
67753 +#ifdef CONFIG_MTRR
67754 +extern void mtrr_ap_init(void);
67755 +extern void mtrr_bp_init(void);
67756 +#else
67757 +#define mtrr_ap_init() do {} while (0)
67758 +#define mtrr_bp_init() do {} while (0)
67759 +#endif
67760 +
67761 +#ifdef CONFIG_X86_MCE
67762 +extern void mcheck_init(struct cpuinfo_x86 *c);
67763 +#else
67764 +#define mcheck_init(c) do {} while(0)
67765 +#endif
67766 +
67767 +#endif /* __ASM_I386_PROCESSOR_H */
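
The cpuid()/cpuid_count() helpers above expand XEN_CPUID, Xen's paravirtualized replacement for the cpuid instruction, but the calling pattern is the same as native. A minimal sketch follows, using the plain "cpuid" instruction (an assumption: it runs on any x86 host, not under a Xen-aware kernel) to show how the leaf number goes in via eax and the results come back in the four registers.

/*
 * Minimal native-cpuid sketch mirroring the cpuid() helper above.
 * Uses the plain "cpuid" instruction rather than XEN_CPUID, so it
 * runs on bare x86; compile with gcc and run to print the vendor.
 */
#include <stdio.h>
#include <string.h>

static void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
                  unsigned int *ecx, unsigned int *edx)
{
        __asm__("cpuid"
                : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                : "0" (op), "c" (0));   /* clear ecx, as the header notes */
}

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        cpuid(0, &eax, &ebx, &ecx, &edx);
        memcpy(vendor + 0, &ebx, 4);    /* vendor string lives in ebx:edx:ecx */
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';
        printf("max basic leaf %u, vendor \"%s\"\n", eax, vendor);
        return 0;
}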
67768 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/ptrace.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/ptrace.h
67769 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/ptrace.h   1970-01-01 01:00:00.000000000 +0100
67770 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/ptrace.h      2006-04-10 00:05:52.000000000 +0200
67771 @@ -0,0 +1,90 @@
67772 +#ifndef _I386_PTRACE_H
67773 +#define _I386_PTRACE_H
67774 +
67775 +#define EBX 0
67776 +#define ECX 1
67777 +#define EDX 2
67778 +#define ESI 3
67779 +#define EDI 4
67780 +#define EBP 5
67781 +#define EAX 6
67782 +#define DS 7
67783 +#define ES 8
67784 +#define FS 9
67785 +#define GS 10
67786 +#define ORIG_EAX 11
67787 +#define EIP 12
67788 +#define CS  13
67789 +#define EFL 14
67790 +#define UESP 15
67791 +#define SS   16
67792 +#define FRAME_SIZE 17
67793 +
67794 +/* this struct defines the way the registers are stored on the 
67795 +   stack during a system call. */
67796 +
67797 +struct pt_regs {
67798 +       long ebx;
67799 +       long ecx;
67800 +       long edx;
67801 +       long esi;
67802 +       long edi;
67803 +       long ebp;
67804 +       long eax;
67805 +       int  xds;
67806 +       int  xes;
67807 +       long orig_eax;
67808 +       long eip;
67809 +       int  xcs;
67810 +       long eflags;
67811 +       long esp;
67812 +       int  xss;
67813 +};
67814 +
67815 +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
67816 +#define PTRACE_GETREGS            12
67817 +#define PTRACE_SETREGS            13
67818 +#define PTRACE_GETFPREGS          14
67819 +#define PTRACE_SETFPREGS          15
67820 +#define PTRACE_GETFPXREGS         18
67821 +#define PTRACE_SETFPXREGS         19
67822 +
67823 +#define PTRACE_OLDSETOPTIONS         21
67824 +
67825 +#define PTRACE_GET_THREAD_AREA    25
67826 +#define PTRACE_SET_THREAD_AREA    26
67827 +
67828 +#define PTRACE_SYSEMU            31
67829 +#define PTRACE_SYSEMU_SINGLESTEP  32
67830 +
67831 +#ifdef __KERNEL__
67832 +
67833 +#include <asm/vm86.h>
67834 +
67835 +struct task_struct;
67836 +extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
67837 +
67838 +/*
67839 + * user_mode_vm(regs) determines whether a register set came from user mode.
67840 + * This is true if V8086 mode was enabled OR if the register set was from
67841 + * protected mode with RPL-3 CS value.  This tricky test checks that with
67842 + * one comparison.  Many places in the kernel can bypass this full check
67843 + * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
67844 + */
67845 +static inline int user_mode(struct pt_regs *regs)
67846 +{
67847 +       return (regs->xcs & 2) != 0;
67848 +}
67849 +static inline int user_mode_vm(struct pt_regs *regs)
67850 +{
67851 +       return ((regs->xcs & 2) | (regs->eflags & VM_MASK)) != 0;
67852 +}
67853 +#define instruction_pointer(regs) ((regs)->eip)
67854 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
67855 +extern unsigned long profile_pc(struct pt_regs *regs);
67856 +#else
67857 +#define profile_pc(regs) instruction_pointer(regs)
67858 +#endif
67859 +#endif /* __KERNEL__ */
67860 +
67861 +#endif
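
The user_mode() test above differs from native i386: since a Xen guest kernel runs in ring 1, "user mode" means privilege level 2 or 3, which bit 1 of the CS RPL captures in a single AND. A toy sketch of that test, with hypothetical selector RPL values fed in by hand:

/*
 * Toy illustration of the user_mode() test above.  Under Xen the
 * kernel runs in ring 1, so "RPL >= 2" (bit 1 of CS) identifies
 * user mode, where native i386 would test the full "& 3".
 */
#include <stdio.h>

static int user_mode(int xcs)
{
        return (xcs & 2) != 0;
}

int main(void)
{
        int rpl;

        /* rings 0/1 -> kernel (hypervisor/guest kernel), 2/3 -> user */
        for (rpl = 0; rpl <= 3; rpl++)
                printf("RPL %d -> %s mode\n", rpl,
                       user_mode(rpl) ? "user" : "kernel");
        return 0;
}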
67862 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/scatterlist.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/scatterlist.h
67863 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/scatterlist.h      1970-01-01 01:00:00.000000000 +0100
67864 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/scatterlist.h 2006-04-10 00:05:52.000000000 +0200
67865 @@ -0,0 +1,22 @@
67866 +#ifndef _I386_SCATTERLIST_H
67867 +#define _I386_SCATTERLIST_H
67868 +
67869 +struct scatterlist {
67870 +    struct page                *page;
67871 +    unsigned int       offset;
67872 +    unsigned int       length;
67873 +    dma_addr_t         dma_address;
67874 +    unsigned int       dma_length;
67875 +};
67876 +
67877 +/* These macros should be used after a pci_map_sg call has been done
67878 + * to get bus addresses of each of the SG entries and their lengths.
67879 + * You should only work with the number of sg entries pci_map_sg
67880 + * returns.
67881 + */
67882 +#define sg_dma_address(sg)     ((sg)->dma_address)
67883 +#define sg_dma_len(sg)         ((sg)->dma_length)
67884 +
67885 +#define ISA_DMA_THRESHOLD (0x00ffffff)
67886 +
67887 +#endif /* !(_I386_SCATTERLIST_H) */
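
The comment above describes the intended usage: after pci_map_sg(), walk only the entries it returned and read bus addresses through the accessors. A hedged mock follows; the struct and macros are copied from the header, while the "mapped" values and entry count are invented purely to drive the loop.

/*
 * Sketch of the post-pci_map_sg() walk described above.  The dma
 * fields here are filled in by hand to stand in for the mapping call.
 */
#include <stdio.h>

typedef unsigned long dma_addr_t;

struct scatterlist {
        void         *page;            /* stand-in for struct page * */
        unsigned int  offset;
        unsigned int  length;
        dma_addr_t    dma_address;
        unsigned int  dma_length;
};

#define sg_dma_address(sg) ((sg)->dma_address)
#define sg_dma_len(sg)     ((sg)->dma_length)

int main(void)
{
        struct scatterlist sg[2] = {
                { 0, 0, 4096, 0x10000, 4096 },   /* pretend pci_map_sg() */
                { 0, 0, 2048, 0x20000, 2048 },   /* filled these in      */
        };
        int i, nents = 2;               /* only trust pci_map_sg()'s return */

        for (i = 0; i < nents; i++)
                printf("entry %d: bus %#lx, len %u\n",
                       i, sg_dma_address(&sg[i]), sg_dma_len(&sg[i]));
        return 0;
}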
67888 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/segment.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/segment.h
67889 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/segment.h  1970-01-01 01:00:00.000000000 +0100
67890 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/segment.h     2006-04-10 00:05:52.000000000 +0200
67891 @@ -0,0 +1,117 @@
67892 +#ifndef _ASM_SEGMENT_H
67893 +#define _ASM_SEGMENT_H
67894 +
67895 +/*
67896 + * The layout of the per-CPU GDT under Linux:
67897 + *
67898 + *   0 - null
67899 + *   1 - reserved
67900 + *   2 - reserved
67901 + *   3 - reserved
67902 + *
67903 + *   4 - unused                        <==== new cacheline
67904 + *   5 - unused
67905 + *
67906 + *  ------- start of TLS (Thread-Local Storage) segments:
67907 + *
67908 + *   6 - TLS segment #1                        [ glibc's TLS segment ]
67909 + *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
67910 + *   8 - TLS segment #3
67911 + *   9 - reserved
67912 + *  10 - reserved
67913 + *  11 - reserved
67914 + *
67915 + *  ------- start of kernel segments:
67916 + *
67917 + *  12 - kernel code segment           <==== new cacheline
67918 + *  13 - kernel data segment
67919 + *  14 - default user CS
67920 + *  15 - default user DS
67921 + *  16 - TSS
67922 + *  17 - LDT
67923 + *  18 - PNPBIOS support (16->32 gate)
67924 + *  19 - PNPBIOS support
67925 + *  20 - PNPBIOS support
67926 + *  21 - PNPBIOS support
67927 + *  22 - PNPBIOS support
67928 + *  23 - APM BIOS support
67929 + *  24 - APM BIOS support
67930 + *  25 - APM BIOS support 
67931 + *
67932 + *  26 - ESPFIX small SS
67933 + *  27 - unused
67934 + *  28 - unused
67935 + *  29 - unused
67936 + *  30 - unused
67937 + *  31 - TSS for double fault handler
67938 + */
67939 +#define GDT_ENTRY_TLS_ENTRIES  3
67940 +#define GDT_ENTRY_TLS_MIN      6
67941 +#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
67942 +
67943 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
67944 +
67945 +#define GDT_ENTRY_DEFAULT_USER_CS      14
67946 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
67947 +
67948 +#define GDT_ENTRY_DEFAULT_USER_DS      15
67949 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
67950 +
67951 +#define GDT_ENTRY_KERNEL_BASE  12
67952 +
67953 +#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE + 0)
67954 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
67955 +#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
67956 +
67957 +#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE + 1)
67958 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
67959 +#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
67960 +
67961 +#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE + 4)
67962 +#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE + 5)
67963 +
67964 +#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 6)
67965 +#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 11)
67966 +
67967 +#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE + 14)
67968 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
67969 +
67970 +#define GDT_ENTRY_DOUBLEFAULT_TSS      31
67971 +
67972 +/*
67973 + * The GDT has 32 entries
67974 + */
67975 +#define GDT_ENTRIES 32
67976 +
67977 +#define GDT_SIZE (GDT_ENTRIES * 8)
67978 +
67979 +/* Simple and small GDT entries for booting only */
67980 +
67981 +#define GDT_ENTRY_BOOT_CS              2
67982 +#define __BOOT_CS      (GDT_ENTRY_BOOT_CS * 8)
67983 +
67984 +#define GDT_ENTRY_BOOT_DS              (GDT_ENTRY_BOOT_CS + 1)
67985 +#define __BOOT_DS      (GDT_ENTRY_BOOT_DS * 8)
67986 +
67987 +/* The PnP BIOS entries in the GDT */
67988 +#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
67989 +#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
67990 +#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
67991 +#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
67992 +#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
67993 +
67994 +/* The PnP BIOS selectors */
67995 +#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
67996 +#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
67997 +#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
67998 +#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
67999 +#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
68000 +
68001 +/*
68002 + * The interrupt descriptor table has room for 256 idt's,
68003 + * the global descriptor table is dependent on the number
68004 + * of tasks we can have..
68005 + */
68006 +#define IDT_ENTRIES 256
68007 +
68008 +#endif
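
All the selector constants above follow the same recipe: GDT index times 8, OR'd with the requested privilege level. A quick standalone sketch recomputes a few of them; the xen_feature() test in GET_KERNEL_CS() is modelled here as a plain boolean, which is an assumption for illustration only.

/*
 * Selector arithmetic behind the constants above: index * 8 | RPL.
 */
#include <stdio.h>

#define SEL(index, rpl) (((index) * 8) | (rpl))

int main(void)
{
        int supervisor_mode_kernel = 0;  /* stand-in for the Xen feature bit */

        printf("__KERNEL_CS = %#x\n", SEL(12, 0));          /* 0x60 */
        printf("__USER_CS   = %#x\n", SEL(14, 3));          /* 0x73 */
        printf("__USER_DS   = %#x\n", SEL(15, 3));          /* 0x7b */
        /* Kernel CS gains RPL 1 when the kernel runs in ring 1 under Xen. */
        printf("GET_KERNEL_CS() = %#x\n",
               SEL(12, 0) | (supervisor_mode_kernel ? 0 : 1));
        return 0;
}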
68009 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/setup.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/setup.h
68010 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/setup.h    1970-01-01 01:00:00.000000000 +0100
68011 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/setup.h       2006-04-10 00:05:52.000000000 +0200
68012 @@ -0,0 +1,66 @@
68013 +/*
68014 + *     Just a placeholder. We don't want to have to test x86 before
68015 + *     we include stuff
68016 + */
68017 +
68018 +#ifndef _i386_SETUP_H
68019 +#define _i386_SETUP_H
68020 +
68021 +#define PFN_UP(x)      (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
68022 +#define PFN_DOWN(x)    ((x) >> PAGE_SHIFT)
68023 +#define PFN_PHYS(x)    ((unsigned long long)(x) << PAGE_SHIFT)
68024 +
68025 +/*
68026 + * Reserved space for vmalloc and iomap - defined in asm/page.h
68027 + */
68028 +#define MAXMEM_PFN     PFN_DOWN(MAXMEM)
68029 +#define MAX_NONPAE_PFN (1 << 20)
68030 +
68031 +#define PARAM_SIZE 4096
68032 +#define COMMAND_LINE_SIZE 256
68033 +
68034 +#define OLD_CL_MAGIC_ADDR      0x90020
68035 +#define OLD_CL_MAGIC           0xA33F
68036 +#define OLD_CL_BASE_ADDR       0x90000
68037 +#define OLD_CL_OFFSET          0x90022
68038 +#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
68039 +
68040 +#ifndef __ASSEMBLY__
68041 +/*
68042 + * This is set up by the setup-routine at boot-time
68043 + */
68044 +extern unsigned char boot_params[PARAM_SIZE];
68045 +
68046 +#define PARAM  (boot_params)
68047 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
68048 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
68049 +#define ALT_MEM_K (*(unsigned long *) (PARAM+0x1e0))
68050 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
68051 +#define E820_MAP    ((struct e820entry *) (PARAM+E820MAP))
68052 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
68053 +#define IST_INFO   (*(struct ist_info *) (PARAM+0x60))
68054 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
68055 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
68056 +#define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4)))
68057 +#define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8)))
68058 +#define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc)))
68059 +#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0)))
68060 +#define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4)))
68061 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
68062 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
68063 +#define VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
68064 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
68065 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
68066 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
68067 +#define KERNEL_START (*(unsigned long *) (PARAM+0x214))
68068 +#define INITRD_START (__pa(xen_start_info->mod_start))
68069 +#define INITRD_SIZE (xen_start_info->mod_len)
68070 +#define EDID_INFO   (*(struct edid_info *) (PARAM+0x440))
68071 +#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
68072 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
68073 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
68074 +#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
68075 +
68076 +#endif /* __ASSEMBLY__ */
68077 +
68078 +#endif /* _i386_SETUP_H */
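
The PFN_UP/PFN_DOWN/PFN_PHYS conversions at the top of this header are simple shift arithmetic; a short runnable demonstration with the usual 4K page size:

/*
 * Demonstration of the PFN_UP/PFN_DOWN/PFN_PHYS conversions above.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define PFN_PHYS(x) ((unsigned long long)(x) << PAGE_SHIFT)

int main(void)
{
        unsigned long addr = 0x12345;

        /* 0x12345 lies inside frame 0x12; PFN_UP rounds to the next one. */
        printf("PFN_DOWN(%#lx) = %#lx\n", addr, PFN_DOWN(addr));
        printf("PFN_UP(%#lx)   = %#lx\n", addr, PFN_UP(addr));
        printf("PFN_PHYS(0x12) = %#llx\n", PFN_PHYS(0x12UL));
        return 0;
}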
68079 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/smp.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/smp.h
68080 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/smp.h      1970-01-01 01:00:00.000000000 +0100
68081 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/smp.h 2006-04-10 00:05:52.000000000 +0200
68082 @@ -0,0 +1,103 @@
68083 +#ifndef __ASM_SMP_H
68084 +#define __ASM_SMP_H
68085 +
68086 +/*
68087 + * We need the APIC definitions automatically as part of 'smp.h'
68088 + */
68089 +#ifndef __ASSEMBLY__
68090 +#include <linux/config.h>
68091 +#include <linux/kernel.h>
68092 +#include <linux/threads.h>
68093 +#include <linux/cpumask.h>
68094 +#endif
68095 +
68096 +#ifdef CONFIG_X86_LOCAL_APIC
68097 +#ifndef __ASSEMBLY__
68098 +#include <asm/fixmap.h>
68099 +#include <asm/bitops.h>
68100 +#include <asm/mpspec.h>
68101 +#ifdef CONFIG_X86_IO_APIC
68102 +#include <asm/io_apic.h>
68103 +#endif
68104 +#include <asm/apic.h>
68105 +#endif
68106 +#endif
68107 +
68108 +#define BAD_APICID 0xFFu
68109 +#ifdef CONFIG_SMP
68110 +#ifndef __ASSEMBLY__
68111 +
68112 +/*
68113 + * Private routines/data
68114 + */
68115 +
68116 +extern void smp_alloc_memory(void);
68117 +extern int pic_mode;
68118 +extern int smp_num_siblings;
68119 +extern cpumask_t cpu_sibling_map[];
68120 +extern cpumask_t cpu_core_map[];
68121 +
68122 +extern void (*mtrr_hook) (void);
68123 +extern void zap_low_mappings (void);
68124 +extern void lock_ipi_call_lock(void);
68125 +extern void unlock_ipi_call_lock(void);
68126 +
68127 +#define MAX_APICID 256
68128 +extern u8 x86_cpu_to_apicid[];
68129 +
68130 +#define cpu_physical_id(cpu)   x86_cpu_to_apicid[cpu]
68131 +
68132 +#ifdef CONFIG_HOTPLUG_CPU
68133 +extern void cpu_exit_clear(void);
68134 +extern void cpu_uninit(void);
68135 +#endif
68136 +
68137 +/*
68138 + * This function is needed by all SMP systems. It must _always_ be valid
68139 + * from the initial startup. We map APIC_BASE very early in page_setup(),
68140 + * so this is correct in the x86 case.
68141 + */
68142 +#define raw_smp_processor_id() (current_thread_info()->cpu)
68143 +
68144 +extern cpumask_t cpu_possible_map;
68145 +#define cpu_callin_map cpu_possible_map
68146 +
68147 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
68148 +static inline int num_booting_cpus(void)
68149 +{
68150 +       return cpus_weight(cpu_possible_map);
68151 +}
68152 +
68153 +#ifdef CONFIG_X86_LOCAL_APIC
68154 +
68155 +#ifdef APIC_DEFINITION
68156 +extern int hard_smp_processor_id(void);
68157 +#else
68158 +#include <mach_apicdef.h>
68159 +static inline int hard_smp_processor_id(void)
68160 +{
68161 +       /* we don't want to mark this access volatile - bad code generation */
68162 +       return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
68163 +}
68164 +#endif
68165 +
68166 +static __inline int logical_smp_processor_id(void)
68167 +{
68168 +       /* we don't want to mark this access volatile - bad code generation */
68169 +       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
68170 +}
68171 +
68172 +#endif
68173 +
68174 +extern int __cpu_disable(void);
68175 +extern void __cpu_die(unsigned int cpu);
68176 +#endif /* !__ASSEMBLY__ */
68177 +
68178 +#else /* CONFIG_SMP */
68179 +
68180 +#define cpu_physical_id(cpu)           boot_cpu_physical_apicid
68181 +
68182 +#define NO_PROC_ID             0xFF            /* No processor magic marker */
68183 +
68184 +#endif
68185 +#endif
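
num_booting_cpus() above reduces to a population count over cpu_possible_map. A minimal sketch of the same idea, modelling cpumask_t as a single unsigned long (an assumption; the real type can span several words):

/*
 * num_booting_cpus() as a popcount over a one-word "cpumask".
 */
#include <stdio.h>

static int cpus_weight(unsigned long mask)
{
        int n = 0;

        while (mask) {
                mask &= mask - 1;   /* clear the lowest set bit */
                n++;
        }
        return n;
}

int main(void)
{
        unsigned long cpu_possible_map = 0x0fUL;   /* CPUs 0-3 possible */

        printf("num_booting_cpus() -> %d\n", cpus_weight(cpu_possible_map));
        return 0;
}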
68186 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/spinlock.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/spinlock.h
68187 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/spinlock.h 1970-01-01 01:00:00.000000000 +0100
68188 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/spinlock.h    2006-04-10 00:05:52.000000000 +0200
68189 @@ -0,0 +1,217 @@
68190 +#ifndef __ASM_SPINLOCK_H
68191 +#define __ASM_SPINLOCK_H
68192 +
68193 +#include <asm/atomic.h>
68194 +#include <asm/rwlock.h>
68195 +#include <asm/page.h>
68196 +#include <linux/config.h>
68197 +#include <linux/compiler.h>
68198 +#include <asm/smp_alt.h>
68199 +
68200 +/*
68201 + * Your basic SMP spinlocks, allowing only a single CPU anywhere
68202 + *
68203 + * Simple spin lock operations.  There are two variants, one clears IRQ's
68204 + * on the local processor, one does not.
68205 + *
68206 + * We make no fairness assumptions. They have a cost.
68207 + *
68208 + * (the type definitions are in asm/spinlock_types.h)
68209 + */
68210 +
68211 +#define __raw_spin_is_locked(x) \
68212 +               (*(volatile signed char *)(&(x)->slock) <= 0)
68213 +
68214 +#define __raw_spin_lock_string \
68215 +       "\n1:\n" \
68216 +       LOCK \
68217 +       "decb %0\n\t" \
68218 +       "jns 3f\n" \
68219 +       "2:\t" \
68220 +       "rep;nop\n\t" \
68221 +       "cmpb $0,%0\n\t" \
68222 +       "jle 2b\n\t" \
68223 +       "jmp 1b\n" \
68224 +       "3:\n\t"
68225 +
68226 +#define __raw_spin_lock_string_flags \
68227 +       "\n1:\n" \
68228 +       LOCK \
68229 +       "decb %0\n\t" \
68230 +       "jns 4f\n\t" \
68231 +       "2:\t" \
68232 +       "testl $0x200, %1\n\t" \
68233 +       "jz 3f\n\t" \
68234 +       "#sti\n\t" \
68235 +       "3:\t" \
68236 +       "rep;nop\n\t" \
68237 +       "cmpb $0, %0\n\t" \
68238 +       "jle 3b\n\t" \
68239 +       "#cli\n\t" \
68240 +       "jmp 1b\n" \
68241 +       "4:\n\t"
68242 +
68243 +static inline void __raw_spin_lock(raw_spinlock_t *lock)
68244 +{
68245 +       __asm__ __volatile__(
68246 +               __raw_spin_lock_string
68247 +               :"=m" (lock->slock) : : "memory");
68248 +}
68249 +
68250 +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
68251 +{
68252 +       __asm__ __volatile__(
68253 +               __raw_spin_lock_string_flags
68254 +               :"=m" (lock->slock) : "r" (flags) : "memory");
68255 +}
68256 +
68257 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
68258 +{
68259 +       char oldval;
68260 +#ifdef CONFIG_SMP_ALTERNATIVES
68261 +       __asm__ __volatile__(
68262 +               "1:movb %1,%b0\n"
68263 +               "movb $0,%1\n"
68264 +               "2:"
68265 +               ".section __smp_alternatives,\"a\"\n"
68266 +               ".long 1b\n"
68267 +               ".long 3f\n"
68268 +               ".previous\n"
68269 +               ".section __smp_replacements,\"a\"\n"
68270 +               "3: .byte 2b - 1b\n"
68271 +               ".byte 5f-4f\n"
68272 +               ".byte 0\n"
68273 +               ".byte 6f-5f\n"
68274 +               ".byte -1\n"
68275 +               "4: xchgb %b0,%1\n"
68276 +               "5: movb %1,%b0\n"
68277 +               "movb $0,%1\n"
68278 +               "6:\n"
68279 +               ".previous\n"
68280 +               :"=q" (oldval), "=m" (lock->slock)
68281 +               :"0" (0) : "memory");
68282 +#else
68283 +       __asm__ __volatile__(
68284 +               "xchgb %b0,%1"
68285 +               :"=q" (oldval), "=m" (lock->slock)
68286 +               :"0" (0) : "memory");
68287 +#endif
68288 +       return oldval > 0;
68289 +}
68290 +
68291 +/*
68292 + * __raw_spin_unlock based on writing $1 to the low byte.
68293 + * This method works, despite all the confusion.
68294 + * (except on PPro SMP or if we are using OOSTORE, so we use xchgb there)
68295 + * (PPro errata 66, 92)
68296 + */
68297 +
68298 +#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
68299 +
68300 +#define __raw_spin_unlock_string \
68301 +       "movb $1,%0" \
68302 +               :"=m" (lock->slock) : : "memory"
68303 +
68304 +
68305 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
68306 +{
68307 +       __asm__ __volatile__(
68308 +               __raw_spin_unlock_string
68309 +       );
68310 +}
68311 +
68312 +#else
68313 +
68314 +#define __raw_spin_unlock_string \
68315 +       "xchgb %b0, %1" \
68316 +               :"=q" (oldval), "=m" (lock->slock) \
68317 +               :"0" (oldval) : "memory"
68318 +
68319 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
68320 +{
68321 +       char oldval = 1;
68322 +
68323 +       __asm__ __volatile__(
68324 +               __raw_spin_unlock_string
68325 +       );
68326 +}
68327 +
68328 +#endif
68329 +
68330 +#define __raw_spin_unlock_wait(lock) \
68331 +       do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
68332 +
68333 +/*
68334 + * Read-write spinlocks, allowing multiple readers
68335 + * but only one writer.
68336 + *
68337 + * NOTE! it is quite common to have readers in interrupts
68338 + * but no interrupt writers. For those circumstances we
68339 + * can "mix" irq-safe locks - any writer needs to get a
68340 + * irq-safe write-lock, but readers can get non-irqsafe
68341 + * read-locks.
68342 + *
68343 + * On x86, we implement read-write locks as a 32-bit counter
68344 + * with the high bit (sign) being the "contended" bit.
68345 + *
68346 + * The inline assembly is non-obvious. Think about it.
68347 + *
68348 + * Changed to use the same technique as rw semaphores.  See
68349 + * semaphore.h for details.  -ben
68350 + *
68351 + * the helpers are in arch/i386/kernel/semaphore.c
68352 + */
68353 +
68354 +/**
68355 + * read_can_lock - would read_trylock() succeed?
68356 + * @lock: the rwlock in question.
68357 + */
68358 +#define __raw_read_can_lock(x)         ((int)(x)->lock > 0)
68359 +
68360 +/**
68361 + * write_can_lock - would write_trylock() succeed?
68362 + * @lock: the rwlock in question.
68363 + */
68364 +#define __raw_write_can_lock(x)                ((x)->lock == RW_LOCK_BIAS)
68365 +
68366 +static inline void __raw_read_lock(raw_rwlock_t *rw)
68367 +{
68368 +       __build_read_lock(rw, "__read_lock_failed");
68369 +}
68370 +
68371 +static inline void __raw_write_lock(raw_rwlock_t *rw)
68372 +{
68373 +       __build_write_lock(rw, "__write_lock_failed");
68374 +}
68375 +
68376 +static inline int __raw_read_trylock(raw_rwlock_t *lock)
68377 +{
68378 +       atomic_t *count = (atomic_t *)lock;
68379 +       atomic_dec(count);
68380 +       if (atomic_read(count) >= 0)
68381 +               return 1;
68382 +       atomic_inc(count);
68383 +       return 0;
68384 +}
68385 +
68386 +static inline int __raw_write_trylock(raw_rwlock_t *lock)
68387 +{
68388 +       atomic_t *count = (atomic_t *)lock;
68389 +       if (atomic_sub_and_test(RW_LOCK_BIAS, count))
68390 +               return 1;
68391 +       atomic_add(RW_LOCK_BIAS, count);
68392 +       return 0;
68393 +}
68394 +
68395 +static inline void __raw_read_unlock(raw_rwlock_t *rw)
68396 +{
68397 +       asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
68398 +}
68399 +
68400 +static inline void __raw_write_unlock(raw_rwlock_t *rw)
68401 +{
68402 +       asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
68403 +                                : "=m" (rw->lock) : : "memory");
68404 +}
68405 +
68406 +#endif /* __ASM_SPINLOCK_H */
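
The byte-lock protocol implemented by the asm strings above is: lock atomically decrements the byte and has the lock iff the result is non-negative, otherwise it spins reading until the byte goes positive and retries; unlock simply stores 1. A minimal C rendering, using GCC __atomic builtins in place of the LOCK-prefixed asm (a sketch of the protocol, not the kernel's implementation):

/*
 * C rendering of the byte-lock protocol above: "LOCK decb" + "jns"
 * to acquire, spin while <= 0, "movb $1" to release.
 */
#include <stdio.h>

typedef struct { volatile signed char slock; } raw_spinlock_t;

static void raw_spin_lock(raw_spinlock_t *lock)
{
        for (;;) {
                /* acquired iff the decremented value is not negative */
                if (__atomic_sub_fetch(&lock->slock, 1, __ATOMIC_ACQUIRE) >= 0)
                        return;
                while (lock->slock <= 0)   /* the "rep;nop" spin */
                        ;
        }
}

static void raw_spin_unlock(raw_spinlock_t *lock)
{
        __atomic_store_n(&lock->slock, 1, __ATOMIC_RELEASE);  /* "movb $1" */
}

int main(void)
{
        raw_spinlock_t lock = { 1 };    /* 1 == unlocked */

        raw_spin_lock(&lock);
        printf("locked, slock = %d\n", lock.slock);
        raw_spin_unlock(&lock);
        printf("unlocked, slock = %d\n", lock.slock);
        return 0;
}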
68407 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/swiotlb.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/swiotlb.h
68408 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/swiotlb.h  1970-01-01 01:00:00.000000000 +0100
68409 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/swiotlb.h     2006-04-10 00:05:52.000000000 +0200
68410 @@ -0,0 +1,43 @@
68411 +#ifndef _ASM_SWIOTLB_H
68412 +#define _ASM_SWIOTLB_H 1
68413 +
68414 +#include <linux/config.h>
68415 +
68416 +/* SWIOTLB interface */
68417 +
68418 +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
68419 +                                     int dir);
68420 +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
68421 +                                 size_t size, int dir);
68422 +extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
68423 +                                        dma_addr_t dev_addr,
68424 +                                        size_t size, int dir);
68425 +extern void swiotlb_sync_single_for_device(struct device *hwdev,
68426 +                                           dma_addr_t dev_addr,
68427 +                                           size_t size, int dir);
68428 +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
68429 +                                    struct scatterlist *sg, int nelems,
68430 +                                    int dir);
68431 +extern void swiotlb_sync_sg_for_device(struct device *hwdev,
68432 +                                       struct scatterlist *sg, int nelems,
68433 +                                       int dir);
68434 +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
68435 +                     int nents, int direction);
68436 +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
68437 +                        int nents, int direction);
68438 +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
68439 +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
68440 +                                   unsigned long offset, size_t size,
68441 +                                   enum dma_data_direction direction);
68442 +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
68443 +                               size_t size, enum dma_data_direction direction);
68444 +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
68445 +extern void swiotlb_init(void);
68446 +
68447 +#ifdef CONFIG_SWIOTLB
68448 +extern int swiotlb;
68449 +#else
68450 +#define swiotlb 0
68451 +#endif
68452 +
68453 +#endif
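
As orientation for this interface, a minimal streaming-DMA round trip might look like the sketch below. Hypothetical driver code: in practice drivers call dma_map_single() and friends, which may be routed to swiotlb behind the scenes:

#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <asm/swiotlb.h>

/* Hypothetical: let a device write 'len' bytes into 'buf' through a
 * bounce buffer, then hand the data back to the CPU. */
static int one_transfer_sketch(struct device *dev, void *buf, size_t len)
{
	dma_addr_t bus = swiotlb_map_single(dev, buf, len, DMA_FROM_DEVICE);

	if (swiotlb_dma_mapping_error(bus))
		return -ENOMEM;

	/* ... program the device with 'bus' and wait for completion ... */

	/* Copy the device's data back out of the bounce buffer. */
	swiotlb_sync_single_for_cpu(dev, bus, len, DMA_FROM_DEVICE);
	swiotlb_unmap_single(dev, bus, len, DMA_FROM_DEVICE);
	return 0;
}
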
68454 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/synch_bitops.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/synch_bitops.h
68455 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/synch_bitops.h     1970-01-01 01:00:00.000000000 +0100
68456 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/synch_bitops.h        2006-04-10 00:05:52.000000000 +0200
68457 @@ -0,0 +1,141 @@
68458 +#ifndef __XEN_SYNCH_BITOPS_H__
68459 +#define __XEN_SYNCH_BITOPS_H__
68460 +
68461 +/*
68462 + * Copyright 1992, Linus Torvalds.
68463 + * Heavily modified to provide guaranteed strong synchronisation
68464 + * when communicating with Xen or other guest OSes running on other CPUs.
68465 + */
68466 +
68467 +#include <linux/config.h>
68468 +
68469 +#define ADDR (*(volatile long *) addr)
68470 +
68471 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
68472 +{
68473 +    __asm__ __volatile__ ( 
68474 +        "lock btsl %1,%0"
68475 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
68476 +}
68477 +
68478 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
68479 +{
68480 +    __asm__ __volatile__ (
68481 +        "lock btrl %1,%0"
68482 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
68483 +}
68484 +
68485 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
68486 +{
68487 +    __asm__ __volatile__ (
68488 +        "lock btcl %1,%0"
68489 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
68490 +}
68491 +
68492 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
68493 +{
68494 +    int oldbit;
68495 +    __asm__ __volatile__ (
68496 +        "lock btsl %2,%1\n\tsbbl %0,%0"
68497 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
68498 +    return oldbit;
68499 +}
68500 +
68501 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
68502 +{
68503 +    int oldbit;
68504 +    __asm__ __volatile__ (
68505 +        "lock btrl %2,%1\n\tsbbl %0,%0"
68506 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
68507 +    return oldbit;
68508 +}
68509 +
68510 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
68511 +{
68512 +    int oldbit;
68513 +
68514 +    __asm__ __volatile__ (
68515 +        "lock btcl %2,%1\n\tsbbl %0,%0"
68516 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
68517 +    return oldbit;
68518 +}
68519 +
68520 +struct __synch_xchg_dummy { unsigned long a[100]; };
68521 +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
68522 +
68523 +#define synch_cmpxchg(ptr, old, new) \
68524 +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
68525 +                                     (unsigned long)(old), \
68526 +                                     (unsigned long)(new), \
68527 +                                     sizeof(*(ptr))))
68528 +
68529 +static inline unsigned long __synch_cmpxchg(volatile void *ptr,
68530 +                                           unsigned long old,
68531 +                                           unsigned long new, int size)
68532 +{
68533 +       unsigned long prev;
68534 +       switch (size) {
68535 +       case 1:
68536 +               __asm__ __volatile__("lock; cmpxchgb %b1,%2"
68537 +                                    : "=a"(prev)
68538 +                                    : "q"(new), "m"(*__synch_xg(ptr)),
68539 +                                      "0"(old)
68540 +                                    : "memory");
68541 +               return prev;
68542 +       case 2:
68543 +               __asm__ __volatile__("lock; cmpxchgw %w1,%2"
68544 +                                    : "=a"(prev)
68545 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
68546 +                                      "0"(old)
68547 +                                    : "memory");
68548 +               return prev;
68549 +#ifdef CONFIG_X86_64
68550 +       case 4:
68551 +               __asm__ __volatile__("lock; cmpxchgl %k1,%2"
68552 +                                    : "=a"(prev)
68553 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
68554 +                                      "0"(old)
68555 +                                    : "memory");
68556 +               return prev;
68557 +       case 8:
68558 +               __asm__ __volatile__("lock; cmpxchgq %1,%2"
68559 +                                    : "=a"(prev)
68560 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
68561 +                                      "0"(old)
68562 +                                    : "memory");
68563 +               return prev;
68564 +#else
68565 +       case 4:
68566 +               __asm__ __volatile__("lock; cmpxchgl %1,%2"
68567 +                                    : "=a"(prev)
68568 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
68569 +                                      "0"(old)
68570 +                                    : "memory");
68571 +               return prev;
68572 +#endif
68573 +       }
68574 +       return old;
68575 +}
68576 +
68577 +static __always_inline int synch_const_test_bit(int nr,
68578 +                                               const volatile void * addr)
68579 +{
68580 +    return ((1UL << (nr & 31)) & 
68581 +            (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
68582 +}
68583 +
68584 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
68585 +{
68586 +    int oldbit;
68587 +    __asm__ __volatile__ (
68588 +        "btl %2,%1\n\tsbbl %0,%0"
68589 +        : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
68590 +    return oldbit;
68591 +}
68592 +
68593 +#define synch_test_bit(nr,addr) \
68594 +(__builtin_constant_p(nr) ? \
68595 + synch_const_test_bit((nr),(addr)) : \
68596 + synch_var_test_bit((nr),(addr)))
68597 +
68598 +#endif /* __XEN_SYNCH_BITOPS_H__ */
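
A short sketch of the kind of cross-domain handshake these locked primitives exist for. Illustrative only; the bitmap argument is hypothetical, though the real event-channel code in this patch uses the same idiom:

#include <asm/synch_bitops.h>

/* Claim a pending flag that Xen, or a guest on another CPU, may be
 * setting concurrently: the locked RMW makes the test-and-clear atomic
 * with respect to those remote writers. */
static inline int claim_pending_sketch(volatile unsigned long *bitmap,
				       int port)
{
	return synch_test_and_clear_bit(port, bitmap);
}
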
68599 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/system.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/system.h
68600 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/system.h   1970-01-01 01:00:00.000000000 +0100
68601 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/system.h      2006-04-10 00:05:52.000000000 +0200
68602 @@ -0,0 +1,679 @@
68603 +#ifndef __ASM_SYSTEM_H
68604 +#define __ASM_SYSTEM_H
68605 +
68606 +#include <linux/config.h>
68607 +#include <linux/kernel.h>
68608 +#include <linux/bitops.h>
68609 +#include <asm/synch_bitops.h>
68610 +#include <asm/segment.h>
68611 +#include <asm/cpufeature.h>
68612 +#include <asm/hypervisor.h>
68613 +#include <asm/smp_alt.h>
68614 +
68615 +#ifdef __KERNEL__
68616 +
68617 +#ifdef CONFIG_SMP
68618 +#define __vcpu_id smp_processor_id()
68619 +#else
68620 +#define __vcpu_id 0
68621 +#endif
68622 +
68623 +struct task_struct;    /* one of the stranger aspects of C forward declarations.. */
68624 +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
68625 +
68626 +#define switch_to(prev,next,last) do {                                 \
68627 +       unsigned long esi,edi;                                          \
68628 +       asm volatile("pushl %%ebp\n\t"                                  \
68629 +                    "movl %%esp,%0\n\t"        /* save ESP */          \
68630 +                    "movl %5,%%esp\n\t"        /* restore ESP */       \
68631 +                    "movl $1f,%1\n\t"          /* save EIP */          \
68632 +                    "pushl %6\n\t"             /* restore EIP */       \
68633 +                    "jmp __switch_to\n"                                \
68634 +                    "1:\t"                                             \
68635 +                    "popl %%ebp\n\t"                                   \
68636 +                    :"=m" (prev->thread.esp),"=m" (prev->thread.eip),  \
68637 +                     "=a" (last),"=S" (esi),"=D" (edi)                 \
68638 +                    :"m" (next->thread.esp),"m" (next->thread.eip),    \
68639 +                     "2" (prev), "d" (next));                          \
68640 +} while (0)
68641 +
68642 +#define _set_base(addr,base) do { unsigned long __pr; \
68643 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
68644 +       "rorl $16,%%edx\n\t" \
68645 +       "movb %%dl,%2\n\t" \
68646 +       "movb %%dh,%3" \
68647 +       :"=&d" (__pr) \
68648 +       :"m" (*((addr)+2)), \
68649 +        "m" (*((addr)+4)), \
68650 +        "m" (*((addr)+7)), \
68651 +         "0" (base) \
68652 +        ); } while(0)
68653 +
68654 +#define _set_limit(addr,limit) do { unsigned long __lr; \
68655 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
68656 +       "rorl $16,%%edx\n\t" \
68657 +       "movb %2,%%dh\n\t" \
68658 +       "andb $0xf0,%%dh\n\t" \
68659 +       "orb %%dh,%%dl\n\t" \
68660 +       "movb %%dl,%2" \
68661 +       :"=&d" (__lr) \
68662 +       :"m" (*(addr)), \
68663 +        "m" (*((addr)+6)), \
68664 +        "0" (limit) \
68665 +        ); } while(0)
68666 +
68667 +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
68668 +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
68669 +
68670 +/*
68671 + * Load a segment. Fall back on loading the zero
68672 + * segment if something goes wrong..
68673 + */
68674 +#define loadsegment(seg,value)                 \
68675 +       asm volatile("\n"                       \
68676 +               "1:\t"                          \
68677 +               "mov %0,%%" #seg "\n"           \
68678 +               "2:\n"                          \
68679 +               ".section .fixup,\"ax\"\n"      \
68680 +               "3:\t"                          \
68681 +               "pushl $0\n\t"                  \
68682 +               "popl %%" #seg "\n\t"           \
68683 +               "jmp 2b\n"                      \
68684 +               ".previous\n"                   \
68685 +               ".section __ex_table,\"a\"\n\t" \
68686 +               ".align 4\n\t"                  \
68687 +               ".long 1b,3b\n"                 \
68688 +               ".previous"                     \
68689 +               : :"rm" (value))
68690 +
68691 +/*
68692 + * Save a segment register away
68693 + */
68694 +#define savesegment(seg, value) \
68695 +       asm volatile("mov %%" #seg ",%0":"=rm" (value))
68696 +
68697 +/*
68698 + * Clear and set 'TS' bit respectively
68699 + */
68700 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
68701 +#define read_cr0() ({ \
68702 +       unsigned int __dummy; \
68703 +       __asm__ __volatile__( \
68704 +               "movl %%cr0,%0\n\t" \
68705 +               :"=r" (__dummy)); \
68706 +       __dummy; \
68707 +})
68708 +#define write_cr0(x) \
68709 +       __asm__ __volatile__("movl %0,%%cr0": :"r" (x));
68710 +
68711 +#define read_cr2() \
68712 +       (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2)
68713 +#define write_cr2(x) \
68714 +       __asm__ __volatile__("movl %0,%%cr2": :"r" (x));
68715 +
68716 +#define read_cr3() ({ \
68717 +       unsigned int __dummy; \
68718 +       __asm__ ( \
68719 +               "movl %%cr3,%0\n\t" \
68720 +               :"=r" (__dummy)); \
68721 +       machine_to_phys(__dummy); \
68722 +})
68723 +#define write_cr3(x) ({                                                \
68724 +       maddr_t __dummy = phys_to_machine(x);                   \
68725 +       __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy));  \
68726 +})
68727 +
68728 +#define read_cr4() ({ \
68729 +       unsigned int __dummy; \
68730 +       __asm__( \
68731 +               "movl %%cr4,%0\n\t" \
68732 +               :"=r" (__dummy)); \
68733 +       __dummy; \
68734 +})
68735 +
68736 +#define read_cr4_safe() ({                           \
68737 +       unsigned int __dummy;                         \
68738 +       /* This could fault if %cr4 does not exist */ \
68739 +       __asm__("1: movl %%cr4, %0              \n"   \
68740 +               "2:                             \n"   \
68741 +               ".section __ex_table,\"a\"      \n"   \
68742 +               ".long 1b,2b                    \n"   \
68743 +               ".previous                      \n"   \
68744 +               : "=r" (__dummy): "0" (0));           \
68745 +       __dummy;                                      \
68746 +})
68747 +
68748 +#define write_cr4(x) \
68749 +       __asm__ __volatile__("movl %0,%%cr4": :"r" (x));
68750 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
68751 +
68752 +#endif /* __KERNEL__ */
68753 +
68754 +#define wbinvd() \
68755 +       __asm__ __volatile__ ("wbinvd": : :"memory");
68756 +
68757 +static inline unsigned long get_limit(unsigned long segment)
68758 +{
68759 +       unsigned long __limit;
68760 +       __asm__("lsll %1,%0"
68761 +               :"=r" (__limit):"r" (segment));
68762 +       return __limit+1;
68763 +}
68764 +
68765 +#define nop() __asm__ __volatile__ ("nop")
68766 +
68767 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
68768 +
68769 +#define tas(ptr) (xchg((ptr),1))
68770 +
68771 +struct __xchg_dummy { unsigned long a[100]; };
68772 +#define __xg(x) ((struct __xchg_dummy *)(x))
68773 +
68774 +
68775 +#ifdef CONFIG_X86_CMPXCHG64
68776 +
68777 +/*
68778 + * The semantics of CMPXCHG8B are a bit strange, which is why
68779 + * there is a loop and the loading of %%eax and %%edx has to
68780 + * be inside. This inlines well in most cases, the cached
68781 + * cost is around ~38 cycles. (in the future we might want
68782 + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
68783 + * might have an implicit FPU-save as a cost, so it's not
68784 + * clear which path to go.)
68785 + *
68786 + * cmpxchg8b must be used with the lock prefix here to allow
68787 + * the instruction to be executed atomically, see page 3-102
68788 + * of the instruction set reference 24319102.pdf. We need
68789 + * the reader side to see the coherent 64bit value.
68790 + */
68791 +static inline void __set_64bit (unsigned long long * ptr,
68792 +               unsigned int low, unsigned int high)
68793 +{
68794 +       __asm__ __volatile__ (
68795 +               "\n1:\t"
68796 +               "movl (%0), %%eax\n\t"
68797 +               "movl 4(%0), %%edx\n\t"
68798 +               "lock cmpxchg8b (%0)\n\t"
68799 +               "jnz 1b"
68800 +               : /* no outputs */
68801 +               :       "D"(ptr),
68802 +                       "b"(low),
68803 +                       "c"(high)
68804 +               :       "ax","dx","memory");
68805 +}
68806 +
68807 +static inline void __set_64bit_constant (unsigned long long *ptr,
68808 +                                                unsigned long long value)
68809 +{
68810 +       __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
68811 +}
68812 +#define ll_low(x)      *(((unsigned int*)&(x))+0)
68813 +#define ll_high(x)     *(((unsigned int*)&(x))+1)
68814 +
68815 +static inline void __set_64bit_var (unsigned long long *ptr,
68816 +                        unsigned long long value)
68817 +{
68818 +       __set_64bit(ptr,ll_low(value), ll_high(value));
68819 +}
68820 +
68821 +#define set_64bit(ptr,value) \
68822 +(__builtin_constant_p(value) ? \
68823 + __set_64bit_constant(ptr, value) : \
68824 + __set_64bit_var(ptr, value) )
68825 +
68826 +#define _set_64bit(ptr,value) \
68827 +(__builtin_constant_p(value) ? \
68828 + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
68829 + __set_64bit(ptr, ll_low(value), ll_high(value)) )
68830 +
68831 +#endif
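
Spelled out in C, the cmpxchg8b loop in __set_64bit() above is equivalent to the following sketch (illustration only, written in terms of the cmpxchg64() helper defined later in this header):

static inline void set_64bit_sketch(unsigned long long *ptr,
				    unsigned long long val)
{
	unsigned long long old;

	do {
		old = *ptr;	/* the "movl (%0),%%eax / movl 4(%0),%%edx" */
	} while (cmpxchg64(ptr, old, val) != old);	/* retry on race */
}
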
68832 +
68833 +/*
68834 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
68835 + * Note 2: xchg has a side effect, so the volatile attribute is necessary,
68836 + *       though strictly the primitive is invalid, since *ptr is also an output. --ANK
68837 + */
68838 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
68839 +{
68840 +       switch (size) {
68841 +               case 1:
68842 +                       __asm__ __volatile__("xchgb %b0,%1"
68843 +                               :"=q" (x)
68844 +                               :"m" (*__xg(ptr)), "0" (x)
68845 +                               :"memory");
68846 +                       break;
68847 +               case 2:
68848 +                       __asm__ __volatile__("xchgw %w0,%1"
68849 +                               :"=r" (x)
68850 +                               :"m" (*__xg(ptr)), "0" (x)
68851 +                               :"memory");
68852 +                       break;
68853 +               case 4:
68854 +                       __asm__ __volatile__("xchgl %0,%1"
68855 +                               :"=r" (x)
68856 +                               :"m" (*__xg(ptr)), "0" (x)
68857 +                               :"memory");
68858 +                       break;
68859 +       }
68860 +       return x;
68861 +}
68862 +
68863 +/*
68864 + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
68865 + * store NEW in MEM.  Return the initial value in MEM.  Success is
68866 + * indicated by comparing RETURN with OLD.
68867 + */
68868 +
68869 +#ifdef CONFIG_X86_CMPXCHG
68870 +#define __HAVE_ARCH_CMPXCHG 1
68871 +#define cmpxchg(ptr,o,n)\
68872 +       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
68873 +                                       (unsigned long)(n),sizeof(*(ptr))))
68874 +#endif
68875 +
68876 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
68877 +                                     unsigned long new, int size)
68878 +{
68879 +       unsigned long prev;
68880 +       switch (size) {
68881 +       case 1:
68882 +               __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
68883 +                                    : "=a"(prev)
68884 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
68885 +                                    : "memory");
68886 +               return prev;
68887 +       case 2:
68888 +               __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
68889 +                                    : "=a"(prev)
68890 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
68891 +                                    : "memory");
68892 +               return prev;
68893 +       case 4:
68894 +               __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
68895 +                                    : "=a"(prev)
68896 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
68897 +                                    : "memory");
68898 +               return prev;
68899 +       }
68900 +       return old;
68901 +}
68902 +
68903 +#ifndef CONFIG_X86_CMPXCHG
68904 +/*
68905 + * When building a kernel capable of running on an 80386, it may be
68906 + * necessary to simulate cmpxchg on the 80386 CPU. For that purpose we define
68907 + * a function for each of the sizes we support.
68908 + */
68909 +
68910 +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
68911 +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
68912 +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
68913 +
68914 +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
68915 +                                     unsigned long new, int size)
68916 +{
68917 +       switch (size) {
68918 +       case 1:
68919 +               return cmpxchg_386_u8(ptr, old, new);
68920 +       case 2:
68921 +               return cmpxchg_386_u16(ptr, old, new);
68922 +       case 4:
68923 +               return cmpxchg_386_u32(ptr, old, new);
68924 +       }
68925 +       return old;
68926 +}
68927 +
68928 +#define cmpxchg(ptr,o,n)                                               \
68929 +({                                                                     \
68930 +       __typeof__(*(ptr)) __ret;                                       \
68931 +       if (likely(boot_cpu_data.x86 > 3))                              \
68932 +               __ret = __cmpxchg((ptr), (unsigned long)(o),            \
68933 +                                       (unsigned long)(n), sizeof(*(ptr))); \
68934 +       else                                                            \
68935 +               __ret = cmpxchg_386((ptr), (unsigned long)(o),          \
68936 +                                       (unsigned long)(n), sizeof(*(ptr))); \
68937 +       __ret;                                                          \
68938 +})
68939 +#endif
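
The success test described above ("compare RETURN with OLD") is used like this in practice. A minimal lock-free increment sketch; the helper name is hypothetical:

static inline void lockfree_inc_sketch(unsigned long *p)
{
	unsigned long old;

	do {
		old = *p;
		/* cmpxchg() returns the value it found in *p; if that is
		 * not 'old', another CPU raced us and we retry. */
	} while (cmpxchg(p, old, old + 1) != old);
}
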
68940 +
68941 +#ifdef CONFIG_X86_CMPXCHG64
68942 +
68943 +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
68944 +                                     unsigned long long new)
68945 +{
68946 +       unsigned long long prev;
68947 +       __asm__ __volatile__(LOCK "cmpxchg8b %3"
68948 +                            : "=A"(prev)
68949 +                            : "b"((unsigned long)new),
68950 +                              "c"((unsigned long)(new >> 32)),
68951 +                              "m"(*__xg(ptr)),
68952 +                              "0"(old)
68953 +                            : "memory");
68954 +       return prev;
68955 +}
68956 +
68957 +#define cmpxchg64(ptr,o,n)\
68958 +       ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
68959 +                                       (unsigned long long)(n)))
68960 +
68961 +#endif
68962 +    
68963 +#ifdef __KERNEL__
68964 +struct alt_instr { 
68965 +       __u8 *instr;            /* original instruction */
68966 +       __u8 *replacement;
68967 +       __u8  cpuid;            /* cpuid bit set for replacement */
68968 +       __u8  instrlen;         /* length of original instruction */
68969 +       __u8  replacementlen;   /* length of new instruction, <= instrlen */ 
68970 +       __u8  pad;
68971 +}; 
68972 +#endif
68973 +
68974 +/* 
68975 + * Alternative instructions for different CPU types or capabilities.
68976 + * 
68977 + * This allows the use of optimized instructions even on generic binary
68978 + * kernels.
68979 + *
68980 + * The length of oldinstr must be greater than or equal to that of newinstr.
68981 + * It can be padded with nops as needed.
68982 + * 
68983 + * For non barrier like inlines please define new variants
68984 + * without volatile and memory clobber.
68985 + */
68986 +#define alternative(oldinstr, newinstr, feature)       \
68987 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                 \
68988 +                     ".section .altinstructions,\"a\"\n"            \
68989 +                     "  .align 4\n"                                   \
68990 +                     "  .long 661b\n"            /* label */          \
68991 +                     "  .long 663f\n"            /* new instruction */         \
68992 +                     "  .byte %c0\n"             /* feature bit */    \
68993 +                     "  .byte 662b-661b\n"       /* sourcelen */      \
68994 +                     "  .byte 664f-663f\n"       /* replacementlen */ \
68995 +                     ".previous\n"                                             \
68996 +                     ".section .altinstr_replacement,\"ax\"\n"                 \
68997 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */    \
68998 +                     ".previous" :: "i" (feature) : "memory")  
68999 +
69000 +/*
69001 + * Alternative inline assembly with input.
69002 + * 
69003 + * Peculiarities:
69004 + * No memory clobber here.
69005 + * Argument numbers start with 1.
69006 + * It is best to use constraints that are fixed size (like (%1) ... "r").
69007 + * If you use variable-sized constraints like "m" or "g" in the
69008 + * replacement, make sure to pad to the worst-case length.
69009 + */
69010 +#define alternative_input(oldinstr, newinstr, feature, input...)               \
69011 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                            \
69012 +                     ".section .altinstructions,\"a\"\n"                       \
69013 +                     "  .align 4\n"                                            \
69014 +                     "  .long 661b\n"            /* label */                   \
69015 +                     "  .long 663f\n"            /* new instruction */         \
69016 +                     "  .byte %c0\n"             /* feature bit */             \
69017 +                     "  .byte 662b-661b\n"       /* sourcelen */               \
69018 +                     "  .byte 664f-663f\n"       /* replacementlen */          \
69019 +                     ".previous\n"                                             \
69020 +                     ".section .altinstr_replacement,\"ax\"\n"                 \
69021 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */        \
69022 +                     ".previous" :: "i" (feature), ##input)
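
For reference, the classic user of alternative_input() elsewhere in the i386 tree is prefetch() in <asm/processor.h>, roughly along these lines (quoted from memory of the 2.6.16 sources, so treat it as a sketch). Note the operand is %1, matching the "argument numbers start with 1" rule above:

static inline void prefetch(const void *x)
{
	alternative_input(ASM_NOP4,		/* oldinstr: 4-byte nop */
			  "prefetchnta (%1)",	/* newinstr if SSE present */
			  X86_FEATURE_XMM,
			  "r" (x));
}
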
69023 +
69024 +/*
69025 + * Force strict CPU ordering.
69026 + * And yes, this is required on UP too when we're talking
69027 + * to devices.
69028 + *
69029 + * For now, "wmb()" doesn't actually do anything, as all
69030 + * Intel CPUs follow what Intel calls a *Processor Order*,
69031 + * in which all writes are seen in the program order even
69032 + * outside the CPU.
69033 + *
69034 + * I expect future Intel CPUs to have a weaker ordering,
69035 + * but I'd also expect them to finally get their act together
69036 + * and add some real memory barriers if so.
69037 + *
69038 + * Some non-Intel clones support out-of-order stores; wmb() ceases to be a
69039 + * no-op for these.
69040 + */
69041 +
69042 +
69043 +/* 
69044 + * Actually only lfence would be needed for mb() because all stores done 
69045 + * by the kernel should be already ordered. But keep a full barrier for now. 
69046 + */
69047 +
69048 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
69049 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
69050 +
69051 +/**
69052 + * read_barrier_depends - Flush all pending reads that subsequent reads
69053 + * depend on.
69054 + *
69055 + * No data-dependent reads from memory-like regions are ever reordered
69056 + * over this barrier.  All reads preceding this primitive are guaranteed
69057 + * to access memory (but not necessarily other CPUs' caches) before any
69058 + * reads following this primitive that depend on the data returned by
69059 + * any of the preceding reads.  This primitive is much lighter weight than
69060 + * rmb() on most CPUs, and is never heavier weight than is
69061 + * rmb().
69062 + *
69063 + * These ordering constraints are respected by both the local CPU
69064 + * and the compiler.
69065 + *
69066 + * Ordering is not guaranteed by anything other than these primitives,
69067 + * not even by data dependencies.  See the documentation for
69068 + * memory_barrier() for examples and URLs to more information.
69069 + *
69070 + * For example, the following code would force ordering (the initial
69071 + * value of "a" is zero, "b" is one, and "p" is "&a"):
69072 + *
69073 + * <programlisting>
69074 + *     CPU 0                           CPU 1
69075 + *
69076 + *     b = 2;
69077 + *     memory_barrier();
69078 + *     p = &b;                         q = p;
69079 + *                                     read_barrier_depends();
69080 + *                                     d = *q;
69081 + * </programlisting>
69082 + *
69083 + * because the read of "*q" depends on the read of "p" and these
69084 + * two reads are separated by a read_barrier_depends().  However,
69085 + * the following code, with the same initial values for "a" and "b":
69086 + *
69087 + * <programlisting>
69088 + *     CPU 0                           CPU 1
69089 + *
69090 + *     a = 2;
69091 + *     memory_barrier();
69092 + *     b = 3;                          y = b;
69093 + *                                     read_barrier_depends();
69094 + *                                     x = a;
69095 + * </programlisting>
69096 + *
69097 + * does not enforce ordering, since there is no data dependency between
69098 + * the read of "a" and the read of "b".  Therefore, on some CPUs, such
69099 + * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
69100 + * in cases like this, where there are no data dependencies.
69101 + **/
69102 +
69103 +#define read_barrier_depends() do { } while(0)
69104 +
69105 +#ifdef CONFIG_X86_OOSTORE
69106 +/* Actually there are no OOO-store-capable CPUs for now that do SSE,
69107 +   but allow for the possibility already. */
69108 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
69109 +#else
69110 +#define wmb()  __asm__ __volatile__ ("": : :"memory")
69111 +#endif
69112 +
69113 +#ifdef CONFIG_SMP
69114 +#define smp_wmb()      wmb()
69115 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
69116 +#define smp_alt_mb(instr)                                           \
69117 +__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
69118 +                    ".section __smp_alternatives,\"a\"\n"          \
69119 +                    ".long 6667b\n"                                \
69120 +                     ".long 6673f\n"                                \
69121 +                    ".previous\n"                                  \
69122 +                    ".section __smp_replacements,\"a\"\n"          \
69123 +                    "6673:.byte 6668b-6667b\n"                     \
69124 +                    ".byte 6670f-6669f\n"                          \
69125 +                    ".byte 6671f-6670f\n"                          \
69126 +                     ".byte 0\n"                                    \
69127 +                    ".byte %c0\n"                                  \
69128 +                    "6669:lock;addl $0,0(%%esp)\n"                 \
69129 +                    "6670:" instr "\n"                             \
69130 +                    "6671:\n"                                      \
69131 +                    ".previous\n"                                  \
69132 +                    :                                              \
69133 +                    : "i" (X86_FEATURE_XMM2)                       \
69134 +                    : "memory")
69135 +#define smp_rmb() smp_alt_mb("lfence")
69136 +#define smp_mb()  smp_alt_mb("mfence")
69137 +#define set_mb(var, value) do {                                     \
69138 +unsigned long __set_mb_temp;                                        \
69139 +__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
69140 +                    ".section __smp_alternatives,\"a\"\n"          \
69141 +                    ".long 6667b\n"                                \
69142 +                    ".long 6673f\n"                                \
69143 +                    ".previous\n"                                  \
69144 +                    ".section __smp_replacements,\"a\"\n"          \
69145 +                    "6673: .byte 6668b-6667b\n"                    \
69146 +                    ".byte 6670f-6669f\n"                          \
69147 +                    ".byte 0\n"                                    \
69148 +                    ".byte 6671f-6670f\n"                          \
69149 +                    ".byte -1\n"                                   \
69150 +                    "6669: xchg %1, %0\n"                          \
69151 +                    "6670:movl %1, %0\n"                           \
69152 +                    "6671:\n"                                      \
69153 +                    ".previous\n"                                  \
69154 +                    : "=m" (var), "=r" (__set_mb_temp)             \
69155 +                    : "1" (value)                                  \
69156 +                    : "memory"); } while (0)
69157 +#else
69158 +#define smp_rmb()      rmb()
69159 +#define smp_mb()       mb()
69160 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
69161 +#endif
69162 +#define smp_read_barrier_depends()     read_barrier_depends()
69163 +#else
69164 +#define smp_mb()       barrier()
69165 +#define smp_rmb()      barrier()
69166 +#define smp_wmb()      barrier()
69167 +#define smp_read_barrier_depends()     do { } while(0)
69168 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
69169 +#endif
69170 +
69171 +#define set_wmb(var, value) do { var = value; wmb(); } while (0)
69172 +
69173 +/* interrupt control.. */
69174 +
69175 +/* 
69176 + * The use of 'barrier' in the following macros reflects their role as
69177 + * local-lock operations. Reentrancy must be prevented (e.g., by __cli())
69178 + * /before/ subsequent critical operations are executed. All critical
69179 + * operations must complete /before/ reentrancy is permitted (e.g., by
69180 + * __sti()). The Alpha architecture also includes these barriers, for example.
69181 + */
69182 +
69183 +#define __cli()                                                                \
69184 +do {                                                                   \
69185 +       vcpu_info_t *_vcpu;                                             \
69186 +       preempt_disable();                                              \
69187 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69188 +       _vcpu->evtchn_upcall_mask = 1;                                  \
69189 +       preempt_enable_no_resched();                                    \
69190 +       barrier();                                                      \
69191 +} while (0)
69192 +
69193 +#define __sti()                                                                \
69194 +do {                                                                   \
69195 +       vcpu_info_t *_vcpu;                                             \
69196 +       barrier();                                                      \
69197 +       preempt_disable();                                              \
69198 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69199 +       _vcpu->evtchn_upcall_mask = 0;                                  \
69200 +       barrier(); /* unmask then check (avoid races) */                \
69201 +       if (unlikely(_vcpu->evtchn_upcall_pending))                     \
69202 +               force_evtchn_callback();                                \
69203 +       preempt_enable();                                               \
69204 +} while (0)
69205 +
69206 +#define __save_flags(x)                                                        \
69207 +do {                                                                   \
69208 +       vcpu_info_t *_vcpu;                                             \
69209 +       preempt_disable();                                              \
69210 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69211 +       (x) = _vcpu->evtchn_upcall_mask;                                \
69212 +       preempt_enable();                                               \
69213 +} while (0)
69214 +
69215 +#define __restore_flags(x)                                             \
69216 +do {                                                                   \
69217 +       vcpu_info_t *_vcpu;                                             \
69218 +       barrier();                                                      \
69219 +       preempt_disable();                                              \
69220 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69221 +       if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {                   \
69222 +               barrier(); /* unmask then check (avoid races) */        \
69223 +               if (unlikely(_vcpu->evtchn_upcall_pending))             \
69224 +                       force_evtchn_callback();                        \
69225 +               preempt_enable();                                       \
69226 +       } else                                                          \
69227 +               preempt_enable_no_resched();                            \
69228 +} while (0)
69229 +
69230 +#define safe_halt()            ((void)0)
69231 +#define halt()                 ((void)0)
69232 +
69233 +#define __save_and_cli(x)                                              \
69234 +do {                                                                   \
69235 +       vcpu_info_t *_vcpu;                                             \
69236 +       preempt_disable();                                              \
69237 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69238 +       (x) = _vcpu->evtchn_upcall_mask;                                \
69239 +       _vcpu->evtchn_upcall_mask = 1;                                  \
69240 +       preempt_enable_no_resched();                                    \
69241 +       barrier();                                                      \
69242 +} while (0)
69243 +
69244 +#define local_irq_save(x)      __save_and_cli(x)
69245 +#define local_irq_restore(x)   __restore_flags(x)
69246 +#define local_save_flags(x)    __save_flags(x)
69247 +#define local_irq_disable()    __cli()
69248 +#define local_irq_enable()     __sti()
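
The caller-visible API is unchanged; a conventional critical section like the sketch below now flips evtchn_upcall_mask instead of executing cli/sti. Illustrative code; 'counter' is hypothetical:

static unsigned long counter;	/* hypothetical shared state */

static void bump_counter_sketch(void)
{
	unsigned long flags;

	local_irq_save(flags);		/* masks event-channel upcalls */
	counter++;
	local_irq_restore(flags);	/* unmask; may force_evtchn_callback() */
}
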
69249 +
69250 +/* Cannot use preempt_enable() here as we would recurse in preempt_schedule(). */
69251 +#define irqs_disabled()                                                        \
69252 +({     int ___x;                                                       \
69253 +       vcpu_info_t *_vcpu;                                             \
69254 +       preempt_disable();                                              \
69255 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69256 +       ___x = (_vcpu->evtchn_upcall_mask != 0);                        \
69257 +       preempt_enable_no_resched();                                    \
69258 +       ___x; })
69259 +
69260 +/*
69261 + * disable hlt during certain critical i/o operations
69262 + */
69263 +#define HAVE_DISABLE_HLT
69264 +void disable_hlt(void);
69265 +void enable_hlt(void);
69266 +
69267 +extern int es7000_plat;
69268 +void cpu_idle_wait(void);
69269 +
69270 +/*
69271 + * On SMP systems, when the scheduler does migration-cost autodetection,
69272 + * it needs a way to flush as much of the CPU's caches as possible:
69273 + */
69274 +static inline void sched_cacheflush(void)
69275 +{
69276 +       wbinvd();
69277 +}
69278 +
69279 +extern unsigned long arch_align_stack(unsigned long sp);
69280 +
69281 +#endif
69282 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/tlbflush.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/tlbflush.h
69283 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/tlbflush.h 1970-01-01 01:00:00.000000000 +0100
69284 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/tlbflush.h    2006-04-10 00:05:52.000000000 +0200
69285 @@ -0,0 +1,102 @@
69286 +#ifndef _I386_TLBFLUSH_H
69287 +#define _I386_TLBFLUSH_H
69288 +
69289 +#include <linux/config.h>
69290 +#include <linux/mm.h>
69291 +#include <asm/processor.h>
69292 +
69293 +#define __flush_tlb() xen_tlb_flush()
69294 +#define __flush_tlb_global() xen_tlb_flush()
69295 +#define __flush_tlb_all() xen_tlb_flush()
69296 +
69297 +extern unsigned long pgkern_mask;
69298 +
69299 +#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
69300 +
69301 +#define __flush_tlb_single(addr) xen_invlpg(addr)
69302 +
69303 +#define __flush_tlb_one(addr) __flush_tlb_single(addr)
69304 +
69305 +/*
69306 + * TLB flushing:
69307 + *
69308 + *  - flush_tlb() flushes the current mm struct TLBs
69309 + *  - flush_tlb_all() flushes all processes TLBs
69310 + *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
69311 + *  - flush_tlb_page(vma, vmaddr) flushes one page
69312 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
69313 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
69314 + *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
69315 + *
69316 + * ..but the i386 has somewhat limited tlb flushing capabilities,
69317 + * and page-granular flushes are available only on i486 and up.
69318 + */
69319 +
69320 +#ifndef CONFIG_SMP
69321 +
69322 +#define flush_tlb() __flush_tlb()
69323 +#define flush_tlb_all() __flush_tlb_all()
69324 +#define local_flush_tlb() __flush_tlb()
69325 +
69326 +static inline void flush_tlb_mm(struct mm_struct *mm)
69327 +{
69328 +       if (mm == current->active_mm)
69329 +               __flush_tlb();
69330 +}
69331 +
69332 +static inline void flush_tlb_page(struct vm_area_struct *vma,
69333 +       unsigned long addr)
69334 +{
69335 +       if (vma->vm_mm == current->active_mm)
69336 +               __flush_tlb_one(addr);
69337 +}
69338 +
69339 +static inline void flush_tlb_range(struct vm_area_struct *vma,
69340 +       unsigned long start, unsigned long end)
69341 +{
69342 +       if (vma->vm_mm == current->active_mm)
69343 +               __flush_tlb();
69344 +}
69345 +
69346 +#else
69347 +
69348 +#include <asm/smp.h>
69349 +
69350 +#define local_flush_tlb() \
69351 +       __flush_tlb()
69352 +
69353 +extern void flush_tlb_all(void);
69354 +extern void flush_tlb_current_task(void);
69355 +extern void flush_tlb_mm(struct mm_struct *);
69356 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
69357 +
69358 +#define flush_tlb()    flush_tlb_current_task()
69359 +
69360 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
69361 +{
69362 +       flush_tlb_mm(vma->vm_mm);
69363 +}
69364 +
69365 +#define TLBSTATE_OK    1
69366 +#define TLBSTATE_LAZY  2
69367 +
69368 +struct tlb_state
69369 +{
69370 +       struct mm_struct *active_mm;
69371 +       int state;
69372 +       char __cacheline_padding[L1_CACHE_BYTES-8];
69373 +};
69374 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
69375 +
69376 +
69377 +#endif
69378 +
69379 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
69380 +
69381 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
69382 +                                     unsigned long start, unsigned long end)
69383 +{
69384 +       /* i386 does not keep any page table caches in TLB */
69385 +}
69386 +
69387 +#endif /* _I386_TLBFLUSH_H */
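
As a usage sketch for the API above: after modifying a single user PTE, the mapping layer drops the one stale translation, which under Xen ends up as a hypercall (xen_invlpg) rather than a raw INVLPG. Illustrative only; the PTE update itself is elided:

#include <asm/tlbflush.h>

static void after_pte_update_sketch(struct vm_area_struct *vma,
				    unsigned long addr)
{
	/* The PTE covering 'addr' has already been rewritten. */
	flush_tlb_page(vma, addr);	/* one page, one address space */
}
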
69388 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/vga.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/vga.h
69389 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/asm/vga.h      1970-01-01 01:00:00.000000000 +0100
69390 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/asm/vga.h 2006-04-10 00:05:52.000000000 +0200
69391 @@ -0,0 +1,20 @@
69392 +/*
69393 + *     Access to VGA videoram
69394 + *
69395 + *     (c) 1998 Martin Mares <mj@ucw.cz>
69396 + */
69397 +
69398 +#ifndef _LINUX_ASM_VGA_H_
69399 +#define _LINUX_ASM_VGA_H_
69400 +
69401 +/*
69402 + *     On the PC, we can just recalculate addresses and then
69403 + *     access the videoram directly without any black magic.
69404 + */
69405 +
69406 +#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
69407 +
69408 +#define vga_readb(x) (*(x))
69409 +#define vga_writeb(x,y) (*(y) = (x))
69410 +
69411 +#endif
69412 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/irq_vectors.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/irq_vectors.h
69413 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/irq_vectors.h  1970-01-01 01:00:00.000000000 +0100
69414 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/irq_vectors.h     2006-04-10 00:05:52.000000000 +0200
69415 @@ -0,0 +1,125 @@
69416 +/*
69417 + * This file should contain #defines for all of the interrupt vector
69418 + * numbers used by this architecture.
69419 + *
69420 + * In addition, there are some standard defines:
69421 + *
69422 + *     FIRST_EXTERNAL_VECTOR:
69423 + *             The first free place for external interrupts
69424 + *
69425 + *     SYSCALL_VECTOR:
69426 + *             The IRQ vector a syscall makes the user to kernel transition
69427 + *             The IRQ vector through which a syscall makes the
69428 + *             user-to-kernel transition.
69429 + *     TIMER_IRQ:
69430 + *             The IRQ number the timer interrupt comes in at.
69431 + *
69432 + *     NR_IRQS:
69433 + *             The total number of interrupt vectors (including all the
69434 + *             architecture specific interrupts) needed.
69435 + *
69436 + */                    
69437 +#ifndef _ASM_IRQ_VECTORS_H
69438 +#define _ASM_IRQ_VECTORS_H
69439 +
69440 +/*
69441 + * IDT vectors usable for external interrupt sources start
69442 + * at 0x20:
69443 + */
69444 +#define FIRST_EXTERNAL_VECTOR  0x20
69445 +
69446 +#define SYSCALL_VECTOR         0x80
69447 +
69448 +/*
69449 + * Vectors 0x20-0x2f are used for ISA interrupts.
69450 + */
69451 +
69452 +#if 0
69453 +/*
69454 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
69455 + *
69456 + *  some of the following vectors are 'rare', they are merged
69457 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
69458 + *  TLB, reschedule and local APIC vectors are performance-critical.
69459 + *
69460 + *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
69461 + */
69462 +#define SPURIOUS_APIC_VECTOR   0xff
69463 +#define ERROR_APIC_VECTOR      0xfe
69464 +#define INVALIDATE_TLB_VECTOR  0xfd
69465 +#define RESCHEDULE_VECTOR      0xfc
69466 +#define CALL_FUNCTION_VECTOR   0xfb
69467 +
69468 +#define THERMAL_APIC_VECTOR    0xf0
69469 +/*
69470 + * Local APIC timer IRQ vector is on a different priority level,
69471 + * to work around the 'lost local interrupt if more than 2 IRQ
69472 + * sources per level' errata.
69473 + */
69474 +#define LOCAL_TIMER_VECTOR     0xef
69475 +#endif
69476 +
69477 +#define SPURIOUS_APIC_VECTOR   0xff
69478 +#define ERROR_APIC_VECTOR      0xfe
69479 +
69480 +/*
69481 + * First APIC vector available to drivers: (vectors 0x30-0xee)
69482 + * we start at 0x31 to spread out vectors evenly between priority
69483 + * levels. (0x80 is the syscall vector)
69484 + */
69485 +#define FIRST_DEVICE_VECTOR    0x31
69486 +#define FIRST_SYSTEM_VECTOR    0xef
69487 +
69488 +/*
69489 + * 16 8259A IRQs, 208 potential APIC interrupt sources.
69490 + * Right now the APIC is mostly only used for SMP.
69491 + * 256 vectors is an architectural limit. (we can have
69492 + * more than 256 devices theoretically, but they will
69493 + * have to use shared interrupts)
69494 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
69495 + * the usable vector space is 0x20-0xff (224 vectors)
69496 + */
69497 +
69498 +#define RESCHEDULE_VECTOR      0
69499 +#define CALL_FUNCTION_VECTOR   1
69500 +#define NR_IPIS                        2
69501 +
69502 +/*
69503 + * The maximum number of vectors supported by i386 processors
69504 + * is limited to 256. For processors other than i386, NR_VECTORS
69505 + * should be changed accordingly.
69506 + */
69507 +#define NR_VECTORS 256
69508 +
69509 +#define FPU_IRQ                        13
69510 +
69511 +#define        FIRST_VM86_IRQ          3
69512 +#define LAST_VM86_IRQ          15
69513 +#define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
69514 +
69515 +/*
69516 + * The flat IRQ space is divided into two regions:
69517 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
69518 + *     if we have physical device-access privilege. This region is at the 
69519 + *     start of the IRQ space so that existing device drivers do not need
69520 + *     to be modified to translate physical IRQ numbers into our IRQ space.
69521 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
69522 + *     are bound using the provided bind/unbind functions.
69523 + */
69524 +
69525 +#define PIRQ_BASE              0
69526 +#define NR_PIRQS               256
69527 +
69528 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
69529 +#define NR_DYNIRQS             256
69530 +
69531 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
69532 +#define NR_IRQ_VECTORS         NR_IRQS
69533 +
69534 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
69535 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
69536 +
69537 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
69538 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
69539 +
69540 +#endif /* _ASM_IRQ_VECTORS_H */
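
With the constants above, the two regions work out as follows (worked example):

/* PIRQ_BASE = 0, NR_PIRQS = 256, DYNIRQ_BASE = 256, NR_DYNIRQS = 256:
 *   pirq_to_irq(14)  == 14    physical IRQs map one-to-one,
 *   dynirq_to_irq(3) == 259   dynamic IRQs occupy 256..511,
 * so NR_IRQS == 512 flat IRQ numbers in total. */
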
69541 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/mach_traps.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/mach_traps.h
69542 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/mach_traps.h   1970-01-01 01:00:00.000000000 +0100
69543 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/mach_traps.h      2006-04-10 00:05:52.000000000 +0200
69544 @@ -0,0 +1,33 @@
69545 +/*
69546 + *  include/asm-xen/asm-i386/mach-xen/mach_traps.h
69547 + *
69548 + *  Machine specific NMI handling for Xen
69549 + */
69550 +#ifndef _MACH_TRAPS_H
69551 +#define _MACH_TRAPS_H
69552 +
69553 +#include <linux/bitops.h>
69554 +#include <xen/interface/nmi.h>
69555 +
69556 +static inline void clear_mem_error(unsigned char reason) {}
69557 +static inline void clear_io_check_error(unsigned char reason) {}
69558 +
69559 +static inline unsigned char get_nmi_reason(void)
69560 +{
69561 +       shared_info_t *s = HYPERVISOR_shared_info;
69562 +       unsigned char reason = 0;
69563 +
69564 +       /* construct a value which looks like it came from
69565 +        * port 0x61.
69566 +        */
69567 +       if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
69568 +               reason |= 0x40;
69569 +       if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
69570 +               reason |= 0x80;
69571 +
69572 +        return reason;
69573 +}
69574 +
69575 +static inline void reassert_nmi(void) {}
69576 +
69577 +#endif /* !_MACH_TRAPS_H */
69578 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/setup_arch_post.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/setup_arch_post.h
69579 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/setup_arch_post.h      1970-01-01 01:00:00.000000000 +0100
69580 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/setup_arch_post.h 2006-04-10 00:05:52.000000000 +0200
69581 @@ -0,0 +1,45 @@
69582 +/**
69583 + * machine_specific_memory_setup - Hook for machine specific memory setup.
69584 + *
69585 + * Description:
69586 + *     This is included late in kernel/setup.c so that it can make
69587 + *     use of all of the static functions.
69588 + **/
69589 +
69590 +static char * __init machine_specific_memory_setup(void)
69591 +{
69592 +       unsigned long max_pfn = xen_start_info->nr_pages;
69593 +
69594 +       e820.nr_map = 0;
69595 +       add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
69596 +
69597 +       return "Xen";
69598 +}
69599 +
69600 +extern void hypervisor_callback(void);
69601 +extern void failsafe_callback(void);
69602 +extern void nmi(void);
69603 +
69604 +static void __init machine_specific_arch_setup(void)
69605 +{
69606 +       struct xen_platform_parameters pp;
69607 +       struct xennmi_callback cb;
69608 +
69609 +       if (xen_feature(XENFEAT_auto_translated_physmap) &&
69610 +           xen_start_info->shared_info < xen_start_info->nr_pages) {
69611 +               HYPERVISOR_shared_info =
69612 +                       (shared_info_t *)__va(xen_start_info->shared_info);
69613 +               memset(empty_zero_page, 0, sizeof(empty_zero_page));
69614 +       }
69615 +
69616 +       HYPERVISOR_set_callbacks(
69617 +           __KERNEL_CS, (unsigned long)hypervisor_callback,
69618 +           __KERNEL_CS, (unsigned long)failsafe_callback);
69619 +
69620 +       cb.handler_address = (unsigned long)&nmi;
69621 +       HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
69622 +
69623 +       if (HYPERVISOR_xen_version(XENVER_platform_parameters,
69624 +                                  &pp) == 0)
69625 +               set_fixaddr_top(pp.virt_start - PAGE_SIZE);
69626 +}
69627 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/mach-xen/setup_arch_pre.h tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/setup_arch_pre.h
69628 --- ref-linux-2.6.16.9/include/asm-i386/mach-xen/setup_arch_pre.h       1970-01-01 01:00:00.000000000 +0100
69629 +++ tmp-linux-2.6-xen.patch/include/asm-i386/mach-xen/setup_arch_pre.h  2006-04-10 00:05:52.000000000 +0200
69630 @@ -0,0 +1,5 @@
69631 +/* Hook to call BIOS initialisation function */
69632 +
69633 +#define ARCH_SETUP machine_specific_arch_setup();
69634 +
69635 +static void __init machine_specific_arch_setup(void);
69636 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/page.h tmp-linux-2.6-xen.patch/include/asm-i386/page.h
69637 --- ref-linux-2.6.16.9/include/asm-i386/page.h  2006-04-19 08:10:14.000000000 +0200
69638 +++ tmp-linux-2.6-xen.patch/include/asm-i386/page.h     2006-04-10 00:05:52.000000000 +0200
69639 @@ -121,7 +121,7 @@ extern int page_is_ram(unsigned long pag
69640  
69641  #define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
69642  #define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
69643 -#define MAXMEM                 (-__PAGE_OFFSET-__VMALLOC_RESERVE)
69644 +#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
69645  #define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
69646  #define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
69647  #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
69648 @@ -139,6 +139,8 @@ extern int page_is_ram(unsigned long pag
69649         ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
69650                  VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
69651  
69652 +#define __HAVE_ARCH_GATE_AREA 1
69653 +
69654  #endif /* __KERNEL__ */
69655  
69656  #include <asm-generic/page.h>
69657 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/rwsem.h tmp-linux-2.6-xen.patch/include/asm-i386/rwsem.h
69658 --- ref-linux-2.6.16.9/include/asm-i386/rwsem.h 2006-04-19 08:10:14.000000000 +0200
69659 +++ tmp-linux-2.6-xen.patch/include/asm-i386/rwsem.h    2006-04-10 00:05:52.000000000 +0200
69660 @@ -40,6 +40,7 @@
69661  
69662  #include <linux/list.h>
69663  #include <linux/spinlock.h>
69664 +#include <asm/smp_alt.h>
69665  
69666  struct rwsem_waiter;
69667  
69668 @@ -99,7 +100,7 @@ static inline void __down_read(struct rw
69669  {
69670         __asm__ __volatile__(
69671                 "# beginning down_read\n\t"
69672 -LOCK_PREFIX    "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
69673 +LOCK           "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
69674                 "  js        2f\n\t" /* jump if we weren't granted the lock */
69675                 "1:\n\t"
69676                 LOCK_SECTION_START("")
69677 @@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
69678                 "  movl      %1,%2\n\t"
69679                 "  addl      %3,%2\n\t"
69680                 "  jle       2f\n\t"
69681 -LOCK_PREFIX    "  cmpxchgl  %2,%0\n\t"
69682 +LOCK           "  cmpxchgl  %2,%0\n\t"
69683                 "  jnz       1b\n\t"
69684                 "2:\n\t"
69685                 "# ending __down_read_trylock\n\t"
69686 @@ -150,7 +151,7 @@ static inline void __down_write(struct r
69687         tmp = RWSEM_ACTIVE_WRITE_BIAS;
69688         __asm__ __volatile__(
69689                 "# beginning down_write\n\t"
69690 -LOCK_PREFIX    "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
69691 +LOCK           "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
69692                 "  testl     %%edx,%%edx\n\t" /* was the count 0 before? */
69693                 "  jnz       2f\n\t" /* jump if we weren't granted the lock */
69694                 "1:\n\t"
69695 @@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
69696         __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
69697         __asm__ __volatile__(
69698                 "# beginning __up_read\n\t"
69699 -LOCK_PREFIX    "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
69700 +LOCK           "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
69701                 "  js        2f\n\t" /* jump if the lock is being waited upon */
69702                 "1:\n\t"
69703                 LOCK_SECTION_START("")
69704 @@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
69705         __asm__ __volatile__(
69706                 "# beginning __up_write\n\t"
69707                 "  movl      %2,%%edx\n\t"
69708 -LOCK_PREFIX    "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
69709 +LOCK           "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
69710                 "  jnz       2f\n\t" /* jump if the lock is being waited upon */
69711                 "1:\n\t"
69712                 LOCK_SECTION_START("")
69713 @@ -239,7 +240,7 @@ static inline void __downgrade_write(str
69714  {
69715         __asm__ __volatile__(
69716                 "# beginning __downgrade_write\n\t"
69717 -LOCK_PREFIX    "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
69718 +LOCK           "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
69719                 "  js        2f\n\t" /* jump if the lock is being waited upon */
69720                 "1:\n\t"
69721                 LOCK_SECTION_START("")
69722 @@ -263,7 +264,7 @@ LOCK_PREFIX "  addl      %2,(%%eax)\n\t"
69723  static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
69724  {
69725         __asm__ __volatile__(
69726 -LOCK_PREFIX    "addl %1,%0"
69727 +LOCK             "addl %1,%0"
69728                 : "=m"(sem->count)
69729                 : "ir"(delta), "m"(sem->count));
69730  }
69731 @@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
69732         int tmp = delta;
69733  
69734         __asm__ __volatile__(
69735 -LOCK_PREFIX    "xadd %0,(%2)"
69736 +LOCK             "xadd %0,(%2)"
69737                 : "+r"(tmp), "=m"(sem->count)
69738                 : "r"(sem), "m"(sem->count)
69739                 : "memory");
69740 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/smp_alt.h tmp-linux-2.6-xen.patch/include/asm-i386/smp_alt.h
69741 --- ref-linux-2.6.16.9/include/asm-i386/smp_alt.h       1970-01-01 01:00:00.000000000 +0100
69742 +++ tmp-linux-2.6-xen.patch/include/asm-i386/smp_alt.h  2006-04-10 00:05:52.000000000 +0200
69743 @@ -0,0 +1,32 @@
69744 +#ifndef __ASM_SMP_ALT_H__
69745 +#define __ASM_SMP_ALT_H__
69746 +
69747 +#include <linux/config.h>
69748 +
69749 +#ifdef CONFIG_SMP
69750 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
69751 +#define LOCK \
69752 +       "6677: nop\n" \
69753 +       ".section __smp_alternatives,\"a\"\n" \
69754 +       ".long 6677b\n" \
69755 +       ".long 6678f\n" \
69756 +       ".previous\n" \
69757 +       ".section __smp_replacements,\"a\"\n" \
69758 +       "6678: .byte 1\n" \
69759 +       ".byte 1\n" \
69760 +       ".byte 0\n" \
69761 +       ".byte 1\n" \
69762 +       ".byte -1\n" \
69763 +       "lock\n" \
69764 +       "nop\n" \
69765 +       ".previous\n"
69766 +void prepare_for_smp(void);
69767 +void unprepare_for_smp(void);
69768 +#else
69769 +#define LOCK "lock ; "
69770 +#endif
69771 +#else
69772 +#define LOCK ""
69773 +#endif
69774 +
69775 +#endif /* __ASM_SMP_ALT_H__ */
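
This header is the core of the SMP-alternatives machinery that the LOCK_PREFIX to LOCK substitutions throughout this patch rely on. Each LOCK site emits a one-byte nop (label 6677), a pointer pair in __smp_alternatives (patch site, descriptor), and a descriptor in __smp_replacements: five size/selector bytes followed by the replacement bytes themselves ("lock" for SMP, "nop" for UP). In C terms the records amount to roughly the following; the field names follow the arch-side walker this patch adds elsewhere, so treat them as illustrative rather than authoritative:

        struct smp_replacement_record {
                unsigned char targ_size;   /* bytes at the patch site (the nop)  */
                unsigned char smp1_size;   /* first SMP variant (here: lock)     */
                unsigned char smp2_size;   /* feature-gated second SMP variant   */
                unsigned char up_size;     /* uniprocessor variant (here: nop)   */
                unsigned char feature;     /* cpufeature selecting smp2, -1 none */
                unsigned char data[];      /* smp1, smp2 and up bytes, in order  */
        };

        struct smp_alternative_record {
                void *targ_start;                    /* label 6677 above */
                struct smp_replacement_record *repl; /* label 6678 above */
        };

prepare_for_smp() and unprepare_for_smp(), declared here for the built-in case, walk __smp_alternatives and copy the appropriate variant over each site as the second CPU comes and goes; modules keep the unconditional "lock ; " prefix because they can be loaded after that choice has been made.
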
69776 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/spinlock.h tmp-linux-2.6-xen.patch/include/asm-i386/spinlock.h
69777 --- ref-linux-2.6.16.9/include/asm-i386/spinlock.h      2006-04-19 08:10:14.000000000 +0200
69778 +++ tmp-linux-2.6-xen.patch/include/asm-i386/spinlock.h 2006-04-10 00:05:52.000000000 +0200
69779 @@ -6,6 +6,7 @@
69780  #include <asm/page.h>
69781  #include <linux/config.h>
69782  #include <linux/compiler.h>
69783 +#include <asm/smp_alt.h>
69784  
69785  /*
69786   * Your basic SMP spinlocks, allowing only a single CPU anywhere
69787 @@ -22,8 +23,9 @@
69788                 (*(volatile signed char *)(&(x)->slock) <= 0)
69789  
69790  #define __raw_spin_lock_string \
69791 -       "\n1:\t" \
69792 -       "lock ; decb %0\n\t" \
69793 +       "\n1:\n" \
69794 +       LOCK \
69795 +       "decb %0\n\t" \
69796         "jns 3f\n" \
69797         "2:\t" \
69798         "rep;nop\n\t" \
69799 @@ -33,8 +35,9 @@
69800         "3:\n\t"
69801  
69802  #define __raw_spin_lock_string_flags \
69803 -       "\n1:\t" \
69804 -       "lock ; decb %0\n\t" \
69805 +       "\n1:\n" \
69806 +       LOCK \
69807 +       "decb %0\n\t" \
69808         "jns 4f\n\t" \
69809         "2:\t" \
69810         "testl $0x200, %1\n\t" \
69811 @@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
69812  static inline int __raw_spin_trylock(raw_spinlock_t *lock)
69813  {
69814         char oldval;
69815 +#ifdef CONFIG_SMP_ALTERNATIVES
69816 +       __asm__ __volatile__(
69817 +               "1:movb %1,%b0\n"
69818 +               "movb $0,%1\n"
69819 +               "2:"
69820 +               ".section __smp_alternatives,\"a\"\n"
69821 +               ".long 1b\n"
69822 +               ".long 3f\n"
69823 +               ".previous\n"
69824 +               ".section __smp_replacements,\"a\"\n"
69825 +               "3: .byte 2b - 1b\n"
69826 +               ".byte 5f-4f\n"
69827 +               ".byte 0\n"
69828 +               ".byte 6f-5f\n"
69829 +               ".byte -1\n"
69830 +               "4: xchgb %b0,%1\n"
69831 +               "5: movb %1,%b0\n"
69832 +               "movb $0,%1\n"
69833 +               "6:\n"
69834 +               ".previous\n"
69835 +               :"=q" (oldval), "=m" (lock->slock)
69836 +               :"0" (0) : "memory");
69837 +#else
69838         __asm__ __volatile__(
69839                 "xchgb %b0,%1"
69840                 :"=q" (oldval), "=m" (lock->slock)
69841                 :"0" (0) : "memory");
69842 +#endif
69843         return oldval > 0;
69844  }
69845  
69846 @@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
69847  
69848  static inline void __raw_read_unlock(raw_rwlock_t *rw)
69849  {
69850 -       asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
69851 +       asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
69852  }
69853  
69854  static inline void __raw_write_unlock(raw_rwlock_t *rw)
69855  {
69856 -       asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
69857 +       asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
69858                                  : "=m" (rw->lock) : : "memory");
69859  }
69860  
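
__raw_spin_trylock gets the same treatment, but in the other direction: the straight-line code (labels 1-2) and the UP replacement (labels 5-6) are a plain byte load followed by a store, and only the SMP replacement (label 4) is the implicitly locked xchgb. The UP form is safe on one CPU, since an interrupt that takes the lock in between will also have dropped it by the time this code resumes; as plain C it is roughly:

        char oldval = lock->slock;      /* 5: movb %1,%b0  (load)     */
        lock->slock = 0;                /*    movb $0,%1   (claim)    */
        return oldval > 0;              /* >0 means the lock was free */
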
69861 diff -Nurp ref-linux-2.6.16.9/include/asm-i386/system.h tmp-linux-2.6-xen.patch/include/asm-i386/system.h
69862 --- ref-linux-2.6.16.9/include/asm-i386/system.h        2006-04-19 08:10:14.000000000 +0200
69863 +++ tmp-linux-2.6-xen.patch/include/asm-i386/system.h   2006-04-10 00:05:52.000000000 +0200
69864 @@ -5,7 +5,7 @@
69865  #include <linux/kernel.h>
69866  #include <asm/segment.h>
69867  #include <asm/cpufeature.h>
69868 -#include <linux/bitops.h> /* for LOCK_PREFIX */
69869 +#include <asm/smp_alt.h>
69870  
69871  #ifdef __KERNEL__
69872  
69873 @@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
69874         unsigned long prev;
69875         switch (size) {
69876         case 1:
69877 -               __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
69878 +               __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
69879                                      : "=a"(prev)
69880                                      : "q"(new), "m"(*__xg(ptr)), "0"(old)
69881                                      : "memory");
69882                 return prev;
69883         case 2:
69884 -               __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
69885 +               __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
69886                                      : "=a"(prev)
69887                                      : "r"(new), "m"(*__xg(ptr)), "0"(old)
69888                                      : "memory");
69889                 return prev;
69890         case 4:
69891 -               __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
69892 +               __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
69893                                      : "=a"(prev)
69894                                      : "r"(new), "m"(*__xg(ptr)), "0"(old)
69895                                      : "memory");
69896 @@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
69897                                       unsigned long long new)
69898  {
69899         unsigned long long prev;
69900 -       __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
69901 +       __asm__ __volatile__(LOCK "cmpxchg8b %3"
69902                              : "=A"(prev)
69903                              : "b"((unsigned long)new),
69904                                "c"((unsigned long)(new >> 32)),
69905 @@ -503,11 +503,55 @@ struct alt_instr { 
69906  #endif
69907  
69908  #ifdef CONFIG_SMP
69909 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
69910 +#define smp_alt_mb(instr)                                           \
69911 +__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
69912 +                    ".section __smp_alternatives,\"a\"\n"          \
69913 +                    ".long 6667b\n"                                \
69914 +                    ".long 6673f\n"                                 \
69915 +                    ".previous\n"                                  \
69916 +                    ".section __smp_replacements,\"a\"\n"          \
69917 +                    "6673:.byte 6668b-6667b\n"                     \
69918 +                    ".byte 6670f-6669f\n"                          \
69919 +                    ".byte 6671f-6670f\n"                          \
69920 +                    ".byte 0\n"                                     \
69921 +                    ".byte %c0\n"                                  \
69922 +                    "6669:lock;addl $0,0(%%esp)\n"                 \
69923 +                    "6670:" instr "\n"                             \
69924 +                    "6671:\n"                                      \
69925 +                    ".previous\n"                                  \
69926 +                    :                                              \
69927 +                    : "i" (X86_FEATURE_XMM2)                       \
69928 +                    : "memory")
69929 +#define smp_mb()  smp_alt_mb("mfence")
69930 +#define smp_rmb() smp_alt_mb("lfence")
69931 +#define set_mb(var, value) do {                                     \
69932 +unsigned long __set_mb_temp;                                        \
69933 +__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
69934 +                    ".section __smp_alternatives,\"a\"\n"          \
69935 +                    ".long 6667b\n"                                \
69936 +                    ".long 6673f\n"                                \
69937 +                    ".previous\n"                                  \
69938 +                    ".section __smp_replacements,\"a\"\n"          \
69939 +                    "6673: .byte 6668b-6667b\n"                    \
69940 +                    ".byte 6670f-6669f\n"                          \
69941 +                    ".byte 0\n"                                    \
69942 +                    ".byte 6671f-6670f\n"                          \
69943 +                    ".byte -1\n"                                   \
69944 +                    "6669: xchg %1, %0\n"                          \
69945 +                    "6670:movl %1, %0\n"                           \
69946 +                    "6671:\n"                                      \
69947 +                    ".previous\n"                                  \
69948 +                    : "=m" (var), "=r" (__set_mb_temp)             \
69949 +                    : "1" (value)                                  \
69950 +                    : "memory"); } while (0)
69951 +#else
69952  #define smp_mb()       mb()
69953  #define smp_rmb()      rmb()
69954 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
69955 +#endif
69956  #define smp_wmb()      wmb()
69957  #define smp_read_barrier_depends()     read_barrier_depends()
69958 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
69959  #else
69960  #define smp_mb()       barrier()
69961  #define smp_rmb()      barrier()
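
With SMP alternatives enabled, the memory barriers are patched the same way: each smp_mb()/smp_rmb() site is six one-byte nops; on SMP it becomes either lock; addl $0,0(%%esp) or, when the CPU advertises X86_FEATURE_XMM2 (the %c0 feature byte in the descriptor), the real mfence/lfence; the UP replacement is empty (size 0), so the nops simply stay. set_mb() likewise degrades from the implicitly locked xchg to a plain movl on UP. The thing worth remembering is that set_mb() is a store plus a full barrier, as in the classic sleep/wakeup idiom (illustrative; "condition" stands in for whatever the caller is waiting on):

        set_mb(current->state, TASK_INTERRUPTIBLE);     /* store, then fence */
        if (!condition)                                 /* re-check after the barrier */
                schedule();
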
69962 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/fixmap.h tmp-linux-2.6-xen.patch/include/asm-ia64/fixmap.h
69963 --- ref-linux-2.6.16.9/include/asm-ia64/fixmap.h        1970-01-01 01:00:00.000000000 +0100
69964 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/fixmap.h   2006-04-10 00:05:52.000000000 +0200
69965 @@ -0,0 +1,2 @@
69966 +#define clear_fixmap(x)        do {} while (0)
69967 +#define        set_fixmap(x,y) do {} while (0)
69968 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/gcc_intrin.h tmp-linux-2.6-xen.patch/include/asm-ia64/gcc_intrin.h
69969 --- ref-linux-2.6.16.9/include/asm-ia64/gcc_intrin.h    2006-04-19 08:10:14.000000000 +0200
69970 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/gcc_intrin.h       2006-04-10 00:05:52.000000000 +0200
69971 @@ -26,7 +26,7 @@ extern void ia64_bad_param_for_getreg (v
69972  
69973  register unsigned long ia64_r13 asm ("r13") __attribute_used__;
69974  
69975 -#define ia64_setreg(regnum, val)                                               \
69976 +#define __ia64_setreg(regnum, val)                                             \
69977  ({                                                                             \
69978         switch (regnum) {                                                       \
69979             case _IA64_REG_PSR_L:                                               \
69980 @@ -55,7 +55,7 @@ register unsigned long ia64_r13 asm ("r1
69981         }                                                                       \
69982  })
69983  
69984 -#define ia64_getreg(regnum)                                                    \
69985 +#define __ia64_getreg(regnum)                                                  \
69986  ({                                                                             \
69987         __u64 ia64_intri_res;                                                   \
69988                                                                                 \
69989 @@ -92,7 +92,7 @@ register unsigned long ia64_r13 asm ("r1
69990  
69991  #define ia64_hint_pause 0
69992  
69993 -#define ia64_hint(mode)                                                \
69994 +#define __ia64_hint(mode)                                              \
69995  ({                                                             \
69996         switch (mode) {                                         \
69997         case ia64_hint_pause:                                   \
69998 @@ -374,7 +374,7 @@ register unsigned long ia64_r13 asm ("r1
69999  
70000  #define ia64_invala() asm volatile ("invala" ::: "memory")
70001  
70002 -#define ia64_thash(addr)                                                       \
70003 +#define __ia64_thash(addr)                                                     \
70004  ({                                                                             \
70005         __u64 ia64_intri_res;                                                   \
70006         asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr));       \
70007 @@ -394,18 +394,18 @@ register unsigned long ia64_r13 asm ("r1
70008  
70009  #define ia64_nop(x)    asm volatile ("nop %0"::"i"(x));
70010  
70011 -#define ia64_itci(addr)        asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
70012 +#define __ia64_itci(addr)      asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
70013  
70014 -#define ia64_itcd(addr)        asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
70015 +#define __ia64_itcd(addr)      asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
70016  
70017  
70018 -#define ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"                                \
70019 +#define __ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"                      \
70020                                              :: "r"(trnum), "r"(addr) : "memory")
70021  
70022 -#define ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"                                \
70023 +#define __ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"                      \
70024                                              :: "r"(trnum), "r"(addr) : "memory")
70025  
70026 -#define ia64_tpa(addr)                                                         \
70027 +#define __ia64_tpa(addr)                                                       \
70028  ({                                                                             \
70029         __u64 ia64_pa;                                                          \
70030         asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory");    \
70031 @@ -415,22 +415,22 @@ register unsigned long ia64_r13 asm ("r1
70032  #define __ia64_set_dbr(index, val)                                             \
70033         asm volatile ("mov dbr[%0]=%1" :: "r"(index), "r"(val) : "memory")
70034  
70035 -#define ia64_set_ibr(index, val)                                               \
70036 +#define __ia64_set_ibr(index, val)                                             \
70037         asm volatile ("mov ibr[%0]=%1" :: "r"(index), "r"(val) : "memory")
70038  
70039 -#define ia64_set_pkr(index, val)                                               \
70040 +#define __ia64_set_pkr(index, val)                                             \
70041         asm volatile ("mov pkr[%0]=%1" :: "r"(index), "r"(val) : "memory")
70042  
70043 -#define ia64_set_pmc(index, val)                                               \
70044 +#define __ia64_set_pmc(index, val)                                             \
70045         asm volatile ("mov pmc[%0]=%1" :: "r"(index), "r"(val) : "memory")
70046  
70047 -#define ia64_set_pmd(index, val)                                               \
70048 +#define __ia64_set_pmd(index, val)                                             \
70049         asm volatile ("mov pmd[%0]=%1" :: "r"(index), "r"(val) : "memory")
70050  
70051 -#define ia64_set_rr(index, val)                                                        \
70052 +#define __ia64_set_rr(index, val)                                                      \
70053         asm volatile ("mov rr[%0]=%1" :: "r"(index), "r"(val) : "memory");
70054  
70055 -#define ia64_get_cpuid(index)                                                          \
70056 +#define __ia64_get_cpuid(index)                                                                \
70057  ({                                                                                     \
70058         __u64 ia64_intri_res;                                                           \
70059         asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index));        \
70060 @@ -444,21 +444,21 @@ register unsigned long ia64_r13 asm ("r1
70061         ia64_intri_res;                                                         \
70062  })
70063  
70064 -#define ia64_get_ibr(index)                                                    \
70065 +#define __ia64_get_ibr(index)                                                  \
70066  ({                                                                             \
70067         __u64 ia64_intri_res;                                                   \
70068         asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
70069         ia64_intri_res;                                                         \
70070  })
70071  
70072 -#define ia64_get_pkr(index)                                                    \
70073 +#define __ia64_get_pkr(index)                                                  \
70074  ({                                                                             \
70075         __u64 ia64_intri_res;                                                   \
70076         asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
70077         ia64_intri_res;                                                         \
70078  })
70079  
70080 -#define ia64_get_pmc(index)                                                    \
70081 +#define __ia64_get_pmc(index)                                                  \
70082  ({                                                                             \
70083         __u64 ia64_intri_res;                                                   \
70084         asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
70085 @@ -466,48 +466,48 @@ register unsigned long ia64_r13 asm ("r1
70086  })
70087  
70088  
70089 -#define ia64_get_pmd(index)                                                    \
70090 +#define __ia64_get_pmd(index)                                                  \
70091  ({                                                                             \
70092         __u64 ia64_intri_res;                                                   \
70093         asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
70094         ia64_intri_res;                                                         \
70095  })
70096  
70097 -#define ia64_get_rr(index)                                                     \
70098 +#define __ia64_get_rr(index)                                                   \
70099  ({                                                                             \
70100         __u64 ia64_intri_res;                                                   \
70101         asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index));    \
70102         ia64_intri_res;                                                         \
70103  })
70104  
70105 -#define ia64_fc(addr)  asm volatile ("fc %0" :: "r"(addr) : "memory")
70106 +#define __ia64_fc(addr)        asm volatile ("fc %0" :: "r"(addr) : "memory")
70107  
70108  
70109  #define ia64_sync_i()  asm volatile (";; sync.i" ::: "memory")
70110  
70111 -#define ia64_ssm(mask) asm volatile ("ssm %0":: "i"((mask)) : "memory")
70112 -#define ia64_rsm(mask) asm volatile ("rsm %0":: "i"((mask)) : "memory")
70113 +#define __ia64_ssm(mask)       asm volatile ("ssm %0":: "i"((mask)) : "memory")
70114 +#define __ia64_rsm(mask)       asm volatile ("rsm %0":: "i"((mask)) : "memory")
70115  #define ia64_sum(mask) asm volatile ("sum %0":: "i"((mask)) : "memory")
70116  #define ia64_rum(mask) asm volatile ("rum %0":: "i"((mask)) : "memory")
70117  
70118 -#define ia64_ptce(addr)        asm volatile ("ptc.e %0" :: "r"(addr))
70119 +#define __ia64_ptce(addr)      asm volatile ("ptc.e %0" :: "r"(addr))
70120  
70121 -#define ia64_ptcga(addr, size)                                                 \
70122 +#define __ia64_ptcga(addr, size)                                                       \
70123  do {                                                                           \
70124         asm volatile ("ptc.ga %0,%1" :: "r"(addr), "r"(size) : "memory");       \
70125         ia64_dv_serialize_data();                                               \
70126  } while (0)
70127  
70128 -#define ia64_ptcl(addr, size)                                                  \
70129 +#define __ia64_ptcl(addr, size)                                                        \
70130  do {                                                                           \
70131         asm volatile ("ptc.l %0,%1" :: "r"(addr), "r"(size) : "memory");        \
70132         ia64_dv_serialize_data();                                               \
70133  } while (0)
70134  
70135 -#define ia64_ptri(addr, size)                                          \
70136 +#define __ia64_ptri(addr, size)                                                \
70137         asm volatile ("ptr.i %0,%1" :: "r"(addr), "r"(size) : "memory")
70138  
70139 -#define ia64_ptrd(addr, size)                                          \
70140 +#define __ia64_ptrd(addr, size)                                                \
70141         asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
70142  
70143  /* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
70144 @@ -589,7 +589,7 @@ do {                                                                                \
70145          }                                                              \
70146  })
70147  
70148 -#define ia64_intrin_local_irq_restore(x)                       \
70149 +#define __ia64_intrin_local_irq_restore(x)                     \
70150  do {                                                           \
70151         asm volatile (";;   cmp.ne p6,p7=%0,r0;;"               \
70152                       "(p6) ssm psr.i;"                         \
70153 @@ -598,4 +598,6 @@ do {                                                                \
70154                       :: "r"((x)) : "p6", "p7", "memory");      \
70155  } while (0)
70156  
70157 +#define __ia64_get_psr_i()     (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
70158 +
70159  #endif /* _ASM_IA64_GCC_INTRIN_H */
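
All of these renames are mechanical: the privileged-operation intrinsics gain a __ia64_ prefix so that asm/privop.h (added further down in this patch) can decide at compile time whether ia64_foo means the raw instruction or a paravirtualized version. The only new functionality is __ia64_get_psr_i(): PSR.i, the interrupt-enable bit, is bit 14 of the processor status register, hence the 0x4000 mask. Equivalently, using the existing kregs.h bit number:

        #define __ia64_get_psr_i()      \
                (__ia64_getreg(_IA64_REG_PSR) & (1UL << IA64_PSR_I_BIT))
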
70160 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/hypercall.h tmp-linux-2.6-xen.patch/include/asm-ia64/hypercall.h
70161 --- ref-linux-2.6.16.9/include/asm-ia64/hypercall.h     1970-01-01 01:00:00.000000000 +0100
70162 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/hypercall.h        2006-04-10 00:05:52.000000000 +0200
70163 @@ -0,0 +1,274 @@
70164 +/******************************************************************************
70165 + * hypercall.h
70166 + * 
70167 + * Linux-specific hypervisor handling.
70168 + * 
70169 + * Copyright (c) 2002-2004, K A Fraser
70170 + * 
70171 + * This program is free software; you can redistribute it and/or
70172 + * modify it under the terms of the GNU General Public License version 2
70173 + * as published by the Free Software Foundation; or, when distributed
70174 + * separately from the Linux kernel or incorporated into other
70175 + * software packages, subject to the following license:
70176 + * 
70177 + * Permission is hereby granted, free of charge, to any person obtaining a copy
70178 + * of this source file (the "Software"), to deal in the Software without
70179 + * restriction, including without limitation the rights to use, copy, modify,
70180 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
70181 + * and to permit persons to whom the Software is furnished to do so, subject to
70182 + * the following conditions:
70183 + * 
70184 + * The above copyright notice and this permission notice shall be included in
70185 + * all copies or substantial portions of the Software.
70186 + * 
70187 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
70188 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
70189 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
70190 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
70191 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
70192 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
70193 + * IN THE SOFTWARE.
70194 + */
70195 +
70196 +#ifndef __HYPERCALL_H__
70197 +#define __HYPERCALL_H__
70198 +
70199 +#ifndef __HYPERVISOR_H__
70200 +# error "please don't include this file directly"
70201 +#endif
70202 +
70203 +/* FIXME: temp place to hold these page related macros */
70204 +#include <asm/page.h>
70205 +#define virt_to_machine(v) __pa(v)
70206 +#define machine_to_virt(m) __va(m)
70207 +#define virt_to_mfn(v) ((__pa(v)) >> PAGE_SHIFT)
70208 +#define mfn_to_virt(m) (__va((m) << PAGE_SHIFT))
70209 +
70210 +/*
70211 + * Assembler stubs for hyper-calls.
70212 + */
70213 +
70214 +#define _hypercall0(type, name)                                        \
70215 +({                                                             \
70216 +       long __res;                                             \
70217 +       __asm__ __volatile__ (";;\n"                            \
70218 +                             "mov r2=%1\n"                     \
70219 +                             "break 0x1000 ;;\n"               \
70220 +                             "mov %0=r8 ;;\n"                  \
70221 +                             : "=r" (__res)                    \
70222 +                             : "i" (__HYPERVISOR_##name)       \
70223 +                             : "r2","r8",                      \
70224 +                               "memory" );                     \
70225 +       (type)__res;                                            \
70226 +})
70227 +
70228 +#define _hypercall1(type, name, a1)                            \
70229 +({                                                             \
70230 +       long __res;                                             \
70231 +       __asm__ __volatile__ (";;\n"                            \
70232 +                             "mov r14=%2\n"                    \
70233 +                             "mov r2=%1\n"                     \
70234 +                             "break 0x1000 ;;\n"               \
70235 +                             "mov %0=r8 ;;\n"                  \
70236 +                             : "=r" (__res)                    \
70237 +                             : "i" (__HYPERVISOR_##name),      \
70238 +                               "r" ((unsigned long)(a1))       \
70239 +                             : "r14","r2","r8",                \
70240 +                               "memory" );                     \
70241 +       (type)__res;                                            \
70242 +})
70243 +
70244 +#define _hypercall2(type, name, a1, a2)                                \
70245 +({                                                             \
70246 +       long __res;                                             \
70247 +       __asm__ __volatile__ (";;\n"                            \
70248 +                             "mov r14=%2\n"                    \
70249 +                             "mov r15=%3\n"                    \
70250 +                             "mov r2=%1\n"                     \
70251 +                             "break 0x1000 ;;\n"               \
70252 +                             "mov %0=r8 ;;\n"                  \
70253 +                             : "=r" (__res)                    \
70254 +                             : "i" (__HYPERVISOR_##name),      \
70255 +                               "r" ((unsigned long)(a1)),      \
70256 +                               "r" ((unsigned long)(a2))       \
70257 +                             : "r14","r15","r2","r8",          \
70258 +                               "memory" );                     \
70259 +       (type)__res;                                            \
70260 +})
70261 +
70262 +#define _hypercall3(type, name, a1, a2, a3)                    \
70263 +({                                                             \
70264 +       long __res;                                             \
70265 +       __asm__ __volatile__ (";;\n"                            \
70266 +                             "mov r14=%2\n"                    \
70267 +                             "mov r15=%3\n"                    \
70268 +                             "mov r16=%4\n"                    \
70269 +                             "mov r2=%1\n"                     \
70270 +                             "break 0x1000 ;;\n"               \
70271 +                             "mov %0=r8 ;;\n"                  \
70272 +                             : "=r" (__res)                    \
70273 +                             : "i" (__HYPERVISOR_##name),      \
70274 +                               "r" ((unsigned long)(a1)),      \
70275 +                               "r" ((unsigned long)(a2)),      \
70276 +                               "r" ((unsigned long)(a3))       \
70277 +                             : "r14","r15","r16","r2","r8",    \
70278 +                               "memory" );                     \
70279 +       (type)__res;                                            \
70280 +})
70281 +
70282 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
70283 +({                                                             \
70284 +       long __res;                                             \
70285 +       __asm__ __volatile__ (";;\n"                            \
70286 +                             "mov r14=%2\n"                    \
70287 +                             "mov r15=%3\n"                    \
70288 +                             "mov r16=%4\n"                    \
70289 +                             "mov r17=%5\n"                    \
70290 +                             "mov r2=%1\n"                     \
70291 +                             "break 0x1000 ;;\n"               \
70292 +                             "mov %0=r8 ;;\n"                  \
70293 +                             : "=r" (__res)                    \
70294 +                             : "i" (__HYPERVISOR_##name),      \
70295 +                               "r" ((unsigned long)(a1)),      \
70296 +                               "r" ((unsigned long)(a2)),      \
70297 +                               "r" ((unsigned long)(a3)),      \
70298 +                               "r" ((unsigned long)(a4))       \
70299 +                             : "r14","r15","r16","r2","r8",    \
70300 +                               "r17","memory" );               \
70301 +       (type)__res;                                            \
70302 +})
70303 +
70304 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
70305 +({                                                             \
70306 +       long __res;                                             \
70307 +       __asm__ __volatile__ (";;\n"                            \
70308 +                             "mov r14=%2\n"                    \
70309 +                             "mov r15=%3\n"                    \
70310 +                             "mov r16=%4\n"                    \
70311 +                             "mov r17=%5\n"                    \
70312 +                             "mov r18=%6\n"                    \
70313 +                             "mov r2=%1\n"                     \
70314 +                             "break 0x1000 ;;\n"               \
70315 +                             "mov %0=r8 ;;\n"                  \
70316 +                             : "=r" (__res)                    \
70317 +                             : "i" (__HYPERVISOR_##name),      \
70318 +                               "r" ((unsigned long)(a1)),      \
70319 +                               "r" ((unsigned long)(a2)),      \
70320 +                               "r" ((unsigned long)(a3)),      \
70321 +                               "r" ((unsigned long)(a4)),      \
70322 +                               "r" ((unsigned long)(a5))       \
70323 +                             : "r14","r15","r16","r2","r8",    \
70324 +                               "r17","r18","memory" );         \
70325 +       (type)__res;                                            \
70326 +})
70327 +
70328 +static inline int
70329 +HYPERVISOR_sched_op_compat(
70330 +    int cmd, unsigned long arg)
70331 +{
70332 +       return _hypercall2(int, sched_op_compat, cmd, arg);
70333 +}
70334 +
70335 +static inline int
70336 +HYPERVISOR_sched_op(
70337 +       int cmd, void *arg)
70338 +{
70339 +       return _hypercall2(int, sched_op, cmd, arg);
70340 +}
70341 +
70342 +static inline long
70343 +HYPERVISOR_set_timer_op(
70344 +    u64 timeout)
70345 +{
70346 +    unsigned long timeout_hi = (unsigned long)(timeout>>32);
70347 +    unsigned long timeout_lo = (unsigned long)timeout;
70348 +    return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
70349 +}
70350 +
70351 +static inline int
70352 +HYPERVISOR_dom0_op(
70353 +    dom0_op_t *dom0_op)
70354 +{
70355 +    dom0_op->interface_version = DOM0_INTERFACE_VERSION;
70356 +    return _hypercall1(int, dom0_op, dom0_op);
70357 +}
70358 +
70359 +static inline int
70360 +HYPERVISOR_multicall(
70361 +    void *call_list, int nr_calls)
70362 +{
70363 +    return _hypercall2(int, multicall, call_list, nr_calls);
70364 +}
70365 +
70366 +static inline int
70367 +HYPERVISOR_memory_op(
70368 +    unsigned int cmd, void *arg)
70369 +{
70370 +    return _hypercall2(int, memory_op, cmd, arg);
70371 +}
70372 +
70373 +static inline int
70374 +HYPERVISOR_event_channel_op(
70375 +    void *op)
70376 +{
70377 +    return _hypercall1(int, event_channel_op, op);
70378 +}
70379 +
70380 +static inline int
70381 +HYPERVISOR_xen_version(
70382 +    int cmd, void *arg)
70383 +{
70384 +    return _hypercall2(int, xen_version, cmd, arg);
70385 +}
70386 +
70387 +static inline int
70388 +HYPERVISOR_console_io(
70389 +    int cmd, int count, char *str)
70390 +{
70391 +    return _hypercall3(int, console_io, cmd, count, str);
70392 +}
70393 +
70394 +static inline int
70395 +HYPERVISOR_physdev_op(
70396 +    void *physdev_op)
70397 +{
70398 +    return _hypercall1(int, physdev_op, physdev_op);
70399 +}
70400 +
70401 +static inline int
70402 +HYPERVISOR_grant_table_op(
70403 +    unsigned int cmd, void *uop, unsigned int count)
70404 +{
70405 +    return _hypercall3(int, grant_table_op, cmd, uop, count);
70406 +}
70407 +
70408 +static inline int
70409 +HYPERVISOR_vcpu_op(
70410 +       int cmd, int vcpuid, void *extra_args)
70411 +{
70412 +    return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
70413 +}
70414 +
70415 +static inline int
70416 +HYPERVISOR_suspend(
70417 +       unsigned long srec)
70418 +{
70419 +       struct sched_shutdown sched_shutdown = {
70420 +               .reason = SHUTDOWN_suspend
70421 +       };
70422 +
70423 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
70424 +                            &sched_shutdown, srec);
70425 +
70426 +       if (rc == -ENOSYS)
70427 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
70428 +                                SHUTDOWN_suspend, srec);
70429 +
70430 +       return rc;
70431 +}
70432 +
70433 +extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
70434 +static inline void exit_idle(void) {}
70435 +#define do_IRQ(irq, regs) __do_IRQ((irq), (regs))
70436 +
70437 +#endif /* __HYPERCALL_H__ */
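
Where the x86 side of this patch routes hypercalls through the hypercall transfer page, the ia64 convention here is a trap: each stub loads the hypercall number into r2 and up to five arguments into r14-r18, executes break 0x1000 to enter Xen, and picks the return value out of r8, which is why those registers appear in the clobber lists. A trivial use of the stubs (sketch only; CONSOLEIO_write comes from the Xen interface headers, and xen_puts is a hypothetical helper):

        #include <linux/string.h>

        static void xen_puts(const char *s)
        {
                /* write straight to the Xen emergency console */
                (void)HYPERVISOR_console_io(CONSOLEIO_write, strlen(s), (char *)s);
        }
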
70438 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/hypervisor.h tmp-linux-2.6-xen.patch/include/asm-ia64/hypervisor.h
70439 --- ref-linux-2.6.16.9/include/asm-ia64/hypervisor.h    1970-01-01 01:00:00.000000000 +0100
70440 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/hypervisor.h       2006-04-10 00:05:52.000000000 +0200
70441 @@ -0,0 +1,138 @@
70442 +/******************************************************************************
70443 + * hypervisor.h
70444 + * 
70445 + * Linux-specific hypervisor handling.
70446 + * 
70447 + * Copyright (c) 2002-2004, K A Fraser
70448 + * 
70449 + * This program is free software; you can redistribute it and/or
70450 + * modify it under the terms of the GNU General Public License version 2
70451 + * as published by the Free Software Foundation; or, when distributed
70452 + * separately from the Linux kernel or incorporated into other
70453 + * software packages, subject to the following license:
70454 + * 
70455 + * Permission is hereby granted, free of charge, to any person obtaining a copy
70456 + * of this source file (the "Software"), to deal in the Software without
70457 + * restriction, including without limitation the rights to use, copy, modify,
70458 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
70459 + * and to permit persons to whom the Software is furnished to do so, subject to
70460 + * the following conditions:
70461 + * 
70462 + * The above copyright notice and this permission notice shall be included in
70463 + * all copies or substantial portions of the Software.
70464 + * 
70465 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
70466 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
70467 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
70468 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
70469 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
70470 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
70471 + * IN THE SOFTWARE.
70472 + */
70473 +
70474 +#ifndef __HYPERVISOR_H__
70475 +#define __HYPERVISOR_H__
70476 +
70477 +#include <linux/config.h>
70478 +#include <linux/types.h>
70479 +#include <linux/kernel.h>
70480 +#include <linux/version.h>
70481 +#include <linux/errno.h>
70482 +#include <xen/interface/xen.h>
70483 +#include <xen/interface/dom0_ops.h>
70484 +#include <xen/interface/sched.h>
70485 +#include <asm/ptrace.h>
70486 +#include <asm/page.h>
70487 +
70488 +extern shared_info_t *HYPERVISOR_shared_info;
70489 +extern start_info_t *xen_start_info;
70490 +
70491 +void force_evtchn_callback(void);
70492 +
70493 +int xen_init(void);
70494 +
70495 +/* Turn jiffies into Xen system time. XXX Implement me. */
70496 +#define jiffies_to_st(j)       0
70497 +
70498 +#include <asm/hypercall.h>
70499 +
70500 +static inline int
70501 +HYPERVISOR_yield(
70502 +       void)
70503 +{
70504 +       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
70505 +
70506 +       if (rc == -ENOSYS)
70507 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
70508 +
70509 +       return rc;
70510 +}
70511 +
70512 +static inline int
70513 +HYPERVISOR_block(
70514 +       void)
70515 +{
70516 +       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
70517 +
70518 +       if (rc == -ENOSYS)
70519 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
70520 +
70521 +       return rc;
70522 +}
70523 +
70524 +static inline int
70525 +HYPERVISOR_shutdown(
70526 +       unsigned int reason)
70527 +{
70528 +       struct sched_shutdown sched_shutdown = {
70529 +               .reason = reason
70530 +       };
70531 +
70532 +       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
70533 +
70534 +       if (rc == -ENOSYS)
70535 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
70536 +
70537 +       return rc;
70538 +}
70539 +
70540 +static inline int
70541 +HYPERVISOR_poll(
70542 +       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
70543 +{
70544 +       struct sched_poll sched_poll = {
70545 +               .ports = ports,
70546 +               .nr_ports = nr_ports,
70547 +               .timeout = jiffies_to_st(timeout)
70548 +       };
70549 +
70550 +       int rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
70551 +
70552 +       if (rc == -ENOSYS)
70553 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
70554 +
70555 +       return rc;
70556 +}
70557 +
70558 +// for drivers/xen/privcmd/privcmd.c
70559 +#define direct_remap_pfn_range(a,b,c,d,e,f) remap_pfn_range(a,b,c,d,e)
70560 +#define        pfn_to_mfn(x)   (x)
70561 +#define        mfn_to_pfn(x)   (x)
70562 +#define machine_to_phys_mapping 0
70563 +
70564 +// for drivers/xen/balloon/balloon.c
70565 +#ifdef CONFIG_XEN_SCRUB_PAGES
70566 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
70567 +#else
70568 +#define scrub_pages(_p,_n) ((void)0)
70569 +#endif
70570 +#define        pte_mfn(_x)     pte_pfn(_x)
70571 +#define INVALID_P2M_ENTRY      (~0UL)
70572 +#define __pte_ma(_x)   ((pte_t) {(_x)})
70573 +#define phys_to_machine_mapping_valid(_x)      (1)
70574 +#define        kmap_flush_unused()     do {} while (0)
70575 +#define set_phys_to_machine(_x,_y)     do {} while (0)
70576 +#define xen_machphys_update(_x,_y)     do {} while (0)
70577 +#define pfn_pte_ma(_x,_y)      __pte_ma(0)
70578 +
70579 +#endif /* __HYPERVISOR_H__ */
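
The pattern shared by yield, block, shutdown and poll above is the compatibility dance: try the modern multiplexed sched_op hypercall first and, if the hypervisor is old enough to return -ENOSYS, fall back to sched_op_compat (poll has no compat equivalent, so it degrades to a plain yield). The macros at the bottom stub out the P2M machinery: ia64 guests run auto-translated, so pseudo-physical and machine frame numbers coincide and pfn_to_mfn()/mfn_to_pfn() are identities. Typical use of the scheduling wrappers (illustrative only; wait_for_flag is not part of the patch):

        static void wait_for_flag(volatile int *flag)
        {
                while (!*flag)
                        (void)HYPERVISOR_yield();  /* give the VCPU's slice back */
        }
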
70580 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/intel_intrin.h tmp-linux-2.6-xen.patch/include/asm-ia64/intel_intrin.h
70581 --- ref-linux-2.6.16.9/include/asm-ia64/intel_intrin.h  2006-04-19 08:10:14.000000000 +0200
70582 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/intel_intrin.h     2006-04-10 00:05:52.000000000 +0200
70583 @@ -119,10 +119,10 @@ __s64 _m64_popcnt(__s64 a);
70584                          * intrinsic
70585                          */
70586  
70587 -#define ia64_getreg            __getReg
70588 -#define ia64_setreg            __setReg
70589 +#define __ia64_getreg          __getReg
70590 +#define __ia64_setreg          __setReg
70591  
70592 -#define ia64_hint(x)
70593 +#define __ia64_hint(x)
70594  
70595  #define ia64_mux1_brcst         0
70596  #define ia64_mux1_mix           8
70597 @@ -135,16 +135,16 @@ __s64 _m64_popcnt(__s64 a);
70598  #define ia64_getf_exp          __getf_exp
70599  #define ia64_shrp              _m64_shrp
70600  
70601 -#define ia64_tpa               __tpa
70602 +#define __ia64_tpa             __tpa
70603  #define ia64_invala            __invala
70604  #define ia64_invala_gr         __invala_gr
70605  #define ia64_invala_fr         __invala_fr
70606  #define ia64_nop               __nop
70607  #define ia64_sum               __sum
70608 -#define ia64_ssm               __ssm
70609 +#define __ia64_ssm             __ssm
70610  #define ia64_rum               __rum
70611 -#define ia64_rsm               __rsm
70612 -#define ia64_fc                __fc
70613 +#define __ia64_rsm             __rsm
70614 +#define __ia64_fc              __fc
70615  
70616  #define ia64_ldfs              __ldfs
70617  #define ia64_ldfd              __ldfd
70618 @@ -182,24 +182,24 @@ __s64 _m64_popcnt(__s64 a);
70619  
70620  #define __ia64_set_dbr(index, val)     \
70621                 __setIndReg(_IA64_REG_INDR_DBR, index, val)
70622 -#define ia64_set_ibr(index, val)       \
70623 +#define __ia64_set_ibr(index, val)     \
70624                 __setIndReg(_IA64_REG_INDR_IBR, index, val)
70625 -#define ia64_set_pkr(index, val)       \
70626 +#define __ia64_set_pkr(index, val)     \
70627                 __setIndReg(_IA64_REG_INDR_PKR, index, val)
70628 -#define ia64_set_pmc(index, val)       \
70629 +#define __ia64_set_pmc(index, val)     \
70630                 __setIndReg(_IA64_REG_INDR_PMC, index, val)
70631 -#define ia64_set_pmd(index, val)       \
70632 +#define __ia64_set_pmd(index, val)     \
70633                 __setIndReg(_IA64_REG_INDR_PMD, index, val)
70634 -#define ia64_set_rr(index, val)        \
70635 +#define __ia64_set_rr(index, val)      \
70636                 __setIndReg(_IA64_REG_INDR_RR, index, val)
70637  
70638 -#define ia64_get_cpuid(index)  __getIndReg(_IA64_REG_INDR_CPUID, index)
70639 +#define __ia64_get_cpuid(index)        __getIndReg(_IA64_REG_INDR_CPUID, index)
70640  #define __ia64_get_dbr(index)  __getIndReg(_IA64_REG_INDR_DBR, index)
70641 -#define ia64_get_ibr(index)    __getIndReg(_IA64_REG_INDR_IBR, index)
70642 -#define ia64_get_pkr(index)    __getIndReg(_IA64_REG_INDR_PKR, index)
70643 -#define ia64_get_pmc(index)    __getIndReg(_IA64_REG_INDR_PMC, index)
70644 -#define ia64_get_pmd(index)    __getIndReg(_IA64_REG_INDR_PMD, index)
70645 -#define ia64_get_rr(index)     __getIndReg(_IA64_REG_INDR_RR, index)
70646 +#define __ia64_get_ibr(index)  __getIndReg(_IA64_REG_INDR_IBR, index)
70647 +#define __ia64_get_pkr(index)  __getIndReg(_IA64_REG_INDR_PKR, index)
70648 +#define __ia64_get_pmc(index)  __getIndReg(_IA64_REG_INDR_PMC, index)
70649 +#define __ia64_get_pmd(index)          __getIndReg(_IA64_REG_INDR_PMD, index)
70650 +#define __ia64_get_rr(index)   __getIndReg(_IA64_REG_INDR_RR, index)
70651  
70652  #define ia64_srlz_d            __dsrlz
70653  #define ia64_srlz_i            __isrlz
70654 @@ -218,18 +218,18 @@ __s64 _m64_popcnt(__s64 a);
70655  #define ia64_ld8_acq           __ld8_acq
70656  
70657  #define ia64_sync_i            __synci
70658 -#define ia64_thash             __thash
70659 -#define ia64_ttag              __ttag
70660 -#define ia64_itcd              __itcd
70661 -#define ia64_itci              __itci
70662 -#define ia64_itrd              __itrd
70663 -#define ia64_itri              __itri
70664 -#define ia64_ptce              __ptce
70665 -#define ia64_ptcl              __ptcl
70666 -#define ia64_ptcg              __ptcg
70667 -#define ia64_ptcga             __ptcga
70668 -#define ia64_ptri              __ptri
70669 -#define ia64_ptrd              __ptrd
70670 +#define __ia64_thash           __thash
70671 +#define __ia64_ttag            __ttag
70672 +#define __ia64_itcd            __itcd
70673 +#define __ia64_itci            __itci
70674 +#define __ia64_itrd            __itrd
70675 +#define __ia64_itri            __itri
70676 +#define __ia64_ptce            __ptce
70677 +#define __ia64_ptcl            __ptcl
70678 +#define __ia64_ptcg            __ptcg
70679 +#define __ia64_ptcga           __ptcga
70680 +#define __ia64_ptri            __ptri
70681 +#define __ia64_ptrd            __ptrd
70682  #define ia64_dep_mi            _m64_dep_mi
70683  
70684  /* Values for lfhint in __lfetch and __lfetch_fault */
70685 @@ -244,14 +244,16 @@ __s64 _m64_popcnt(__s64 a);
70686  #define ia64_lfetch_fault      __lfetch_fault
70687  #define ia64_lfetch_fault_excl __lfetch_fault_excl
70688  
70689 -#define ia64_intrin_local_irq_restore(x)               \
70690 +#define __ia64_intrin_local_irq_restore(x)             \
70691  do {                                                   \
70692         if ((x) != 0) {                                 \
70693 -               ia64_ssm(IA64_PSR_I);                   \
70694 +               __ia64_ssm(IA64_PSR_I);                 \
70695                 ia64_srlz_d();                          \
70696         } else {                                        \
70697 -               ia64_rsm(IA64_PSR_I);                   \
70698 +               __ia64_rsm(IA64_PSR_I);                 \
70699         }                                               \
70700  } while (0)
70701  
70702 +#define __ia64_get_psr_i()     (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
70703 +
70704  #endif /* _ASM_IA64_INTEL_INTRIN_H */
70705 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/meminit.h tmp-linux-2.6-xen.patch/include/asm-ia64/meminit.h
70706 --- ref-linux-2.6.16.9/include/asm-ia64/meminit.h       2006-04-19 08:10:14.000000000 +0200
70707 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/meminit.h  2006-04-10 00:05:52.000000000 +0200
70708 @@ -17,10 +17,15 @@
70709   *     - command line string
70710   *     - kernel code & data
70711   *     - Kernel memory map built from EFI memory map
70712 + *     - xen start info
70713   *
70714   * More could be added if necessary
70715   */
70716 +#ifndef CONFIG_XEN
70717  #define IA64_MAX_RSVD_REGIONS 6
70718 +#else
70719 +#define IA64_MAX_RSVD_REGIONS 7
70720 +#endif
70721  
70722  struct rsvd_region {
70723         unsigned long start;    /* virtual address of beginning of element */
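
The seventh slot is for the start-info page Xen hands the guest, which has to stay out of the bootmem allocator like the other reserved ranges. The real bookkeeping lives in the ia64 setup path; a hypothetical sketch of how the extra entry would be filled, with names assumed rather than taken from the patch:

        /* reserve the page backing xen_start_info (illustrative only) */
        rsvd_region[n].start = (unsigned long)xen_start_info;
        rsvd_region[n].end   = rsvd_region[n].start + PAGE_SIZE;
        n++;
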
70724 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/pal.h tmp-linux-2.6-xen.patch/include/asm-ia64/pal.h
70725 --- ref-linux-2.6.16.9/include/asm-ia64/pal.h   2006-04-19 08:10:14.000000000 +0200
70726 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/pal.h      2006-04-10 00:05:52.000000000 +0200
70727 @@ -81,6 +81,7 @@
70728  #ifndef __ASSEMBLY__
70729  
70730  #include <linux/types.h>
70731 +#include <asm/processor.h>
70732  #include <asm/fpu.h>
70733  
70734  /*
70735 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/privop.h tmp-linux-2.6-xen.patch/include/asm-ia64/privop.h
70736 --- ref-linux-2.6.16.9/include/asm-ia64/privop.h        1970-01-01 01:00:00.000000000 +0100
70737 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/privop.h   2006-04-10 00:05:52.000000000 +0200
70738 @@ -0,0 +1,59 @@
70739 +#ifndef _ASM_IA64_PRIVOP_H
70740 +#define _ASM_IA64_PRIVOP_H
70741 +
70742 +/*
70743 + * Copyright (C) 2005 Hewlett-Packard Co
70744 + *     Dan Magenheimer <dan.magenheimer@hp.com>
70745 + *
70746 + */
70747 +
70748 +#include <linux/config.h>
70749 +#ifdef CONFIG_XEN
70750 +#include <asm/xen/privop.h>
70751 +#endif
70752 +
70753 +#ifndef __ASSEMBLY
70754 +
70755 +#ifndef IA64_PARAVIRTUALIZED
70756 +
70757 +#define ia64_getreg                    __ia64_getreg
70758 +#define ia64_setreg                    __ia64_setreg
70759 +#define ia64_hint                      __ia64_hint
70760 +#define ia64_thash                     __ia64_thash
70761 +#define ia64_itci                      __ia64_itci
70762 +#define ia64_itcd                      __ia64_itcd
70763 +#define ia64_itri                      __ia64_itri
70764 +#define ia64_itrd                      __ia64_itrd
70765 +#define ia64_tpa                       __ia64_tpa
70766 +#define ia64_set_ibr                   __ia64_set_ibr
70767 +#define ia64_set_pkr                   __ia64_set_pkr
70768 +#define ia64_set_pmc                   __ia64_set_pmc
70769 +#define ia64_set_pmd                   __ia64_set_pmd
70770 +#define ia64_set_rr                    __ia64_set_rr
70771 +#define ia64_get_cpuid                 __ia64_get_cpuid
70772 +#define ia64_get_ibr                   __ia64_get_ibr
70773 +#define ia64_get_pkr                   __ia64_get_pkr
70774 +#define ia64_get_pmc                   __ia64_get_pmc
70775 +#define ia64_get_pmd                   __ia64_get_pmd
70776 +#define ia64_get_rr                    __ia64_get_rr
70777 +#define ia64_fc                                __ia64_fc
70778 +#define ia64_ssm                       __ia64_ssm
70779 +#define ia64_rsm                       __ia64_rsm
70780 +#define ia64_ptce                      __ia64_ptce
70781 +#define ia64_ptcga                     __ia64_ptcga
70782 +#define ia64_ptcl                      __ia64_ptcl
70783 +#define ia64_ptri                      __ia64_ptri
70784 +#define ia64_ptrd                      __ia64_ptrd
70785 +#define        ia64_get_psr_i                  __ia64_get_psr_i
70786 +#define ia64_intrin_local_irq_restore  __ia64_intrin_local_irq_restore
70787 +#define ia64_pal_halt_light            __ia64_pal_halt_light
70788 +#define        ia64_leave_kernel               __ia64_leave_kernel
70789 +#define        ia64_leave_syscall              __ia64_leave_syscall
70790 +#define        ia64_switch_to                  __ia64_switch_to
70791 +#define        ia64_pal_call_static            __ia64_pal_call_static
70792 +
70793 +#endif /* !IA64_PARAVIRTUALIZED */
70794 +
70795 +#endif /* !__ASSEMBLY */
70796 +
70797 +#endif /* _ASM_IA64_PRIVOP_H */
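
This header is the pivot of the ia64 paravirtualization scheme: a native build maps every ia64_* privileged operation straight back onto the raw __ia64_* intrinsic, so the generated code is unchanged, while a CONFIG_XEN build pulls in asm/xen/privop.h first. Judging from the structure here, that header is expected to define IA64_PARAVIRTUALIZED and rebind the same names to hypervisor-aware versions, on the order of (hypothetical excerpt, not shown in this patch section):

        #define IA64_PARAVIRTUALIZED
        /* e.g. read PSR.i from shared state instead of trapping: */
        #define ia64_get_psr_i()        xen_get_psr_i()
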
70798 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/processor.h tmp-linux-2.6-xen.patch/include/asm-ia64/processor.h
70799 --- ref-linux-2.6.16.9/include/asm-ia64/processor.h     2006-04-19 08:10:14.000000000 +0200
70800 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/processor.h        2006-04-10 00:05:52.000000000 +0200
70801 @@ -19,6 +19,7 @@
70802  #include <asm/kregs.h>
70803  #include <asm/ptrace.h>
70804  #include <asm/ustack.h>
70805 +#include <asm/privop.h>
70806  
70807  #define IA64_NUM_DBG_REGS      8
70808  /*
70809 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/synch_bitops.h tmp-linux-2.6-xen.patch/include/asm-ia64/synch_bitops.h
70810 --- ref-linux-2.6.16.9/include/asm-ia64/synch_bitops.h  1970-01-01 01:00:00.000000000 +0100
70811 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/synch_bitops.h     2006-04-10 00:05:52.000000000 +0200
70812 @@ -0,0 +1,61 @@
70813 +#ifndef __XEN_SYNCH_BITOPS_H__
70814 +#define __XEN_SYNCH_BITOPS_H__
70815 +
70816 +/*
70817 + * Copyright 1992, Linus Torvalds.
70818 + * Heavily modified to provide guaranteed strong synchronisation
70819 + * when communicating with Xen or other guest OSes running on other CPUs.
70820 + */
70821 +
70822 +#include <linux/config.h>
70823 +
70824 +#define ADDR (*(volatile long *) addr)
70825 +
70826 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
70827 +{
70828 +       set_bit(nr, addr);
70829 +}
70830 +
70831 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
70832 +{
70833 +       clear_bit(nr, addr);
70834 +}
70835 +
70836 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
70837 +{
70838 +       change_bit(nr, addr);
70839 +}
70840 +
70841 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
70842 +{
70843 +       return test_and_set_bit(nr, addr);
70844 +}
70845 +
70846 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
70847 +{
70848 +       return test_and_clear_bit(nr, addr);
70849 +}
70850 +
70851 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
70852 +{
70853 +       return test_and_change_bit(nr, addr);
70854 +}
70855 +
70856 +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
70857 +{
70858 +       return test_bit(nr, addr);
70859 +}
70860 +
70861 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
70862 +{
70863 +       return test_bit(nr, addr);
70864 +}
70865 +
70866 +#define synch_cmpxchg  ia64_cmpxchg4_acq
70867 +
70868 +#define synch_test_bit(nr,addr) \
70869 +(__builtin_constant_p(nr) ? \
70870 + synch_const_test_bit((nr),(addr)) : \
70871 + synch_var_test_bit((nr),(addr)))
70872 +
70873 +#endif /* __XEN_SYNCH_BITOPS_H__ */
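On ia64 these synch_* operations can simply delegate to the stock atomic bitops, since the architecture's test_and_set_bit() family already provides full atomicity with acquire semantics; the corresponding x86 header needs explicitly locked variants instead (stated here as background, not in the patch). A hypothetical use, for illustration:

        /* sketch: hand a flag to a guest running on another CPU */
        if (!synch_test_and_set_bit(0, &shared_page->flags))
                notify_other_end();     /* hypothetical notification hook */
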
70874 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/system.h tmp-linux-2.6-xen.patch/include/asm-ia64/system.h
70875 --- ref-linux-2.6.16.9/include/asm-ia64/system.h        2006-04-19 08:10:14.000000000 +0200
70876 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/system.h   2006-04-10 00:05:52.000000000 +0200
70877 @@ -125,7 +125,7 @@ extern struct ia64_boot_param {
70878  #define __local_irq_save(x)                    \
70879  do {                                           \
70880         ia64_stop();                            \
70881 -       (x) = ia64_getreg(_IA64_REG_PSR);       \
70882 +       (x) = ia64_get_psr_i();                 \
70883         ia64_stop();                            \
70884         ia64_rsm(IA64_PSR_I);                   \
70885  } while (0)
70886 @@ -173,7 +173,7 @@ do {                                                                \
70887  #endif /* !CONFIG_IA64_DEBUG_IRQ */
70888  
70889  #define local_irq_enable()     ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); })
70890 -#define local_save_flags(flags)        ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); })
70891 +#define local_save_flags(flags)        ({ ia64_stop(); (flags) = ia64_get_psr_i(); })
70892  
70893  #define irqs_disabled()                                \
70894  ({                                             \
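Both hunks narrow the IRQ-flag primitives from reading the whole PSR to ia64_get_psr_i(), which yields only the interrupt-enable bit. Under Xen this binds to the xen_get_psr_i() macro added in asm-ia64/xen/privop.h below, turning a privileged register read into one ordinary load. A sketch of the two expansions (the native one is an assumption about what __ia64_get_psr_i() amounts to):

        /* Xen path: a single load of the memory-mapped psr.i word */
        flags = (*(int *)XSI_PSR_I) ? IA64_PSR_I : 0;
        /* native path (assumed): read PSR, mask the interrupt bit */
        flags = __ia64_getreg(_IA64_REG_PSR) & IA64_PSR_I;
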
70895 diff -Nurp ref-linux-2.6.16.9/include/asm-ia64/xen/privop.h tmp-linux-2.6-xen.patch/include/asm-ia64/xen/privop.h
70896 --- ref-linux-2.6.16.9/include/asm-ia64/xen/privop.h    1970-01-01 01:00:00.000000000 +0100
70897 +++ tmp-linux-2.6-xen.patch/include/asm-ia64/xen/privop.h       2006-04-10 00:05:52.000000000 +0200
70898 @@ -0,0 +1,277 @@
70899 +#ifndef _ASM_IA64_XEN_PRIVOP_H
70900 +#define _ASM_IA64_XEN_PRIVOP_H
70901 +
70902 +/*
70903 + * Copyright (C) 2005 Hewlett-Packard Co
70904 + *     Dan Magenheimer <dan.magenheimer@hp.com>
70905 + *
70906 + * Paravirtualizations of privileged operations for Xen/ia64
70907 + *
70908 + */
70909 +
70910 +
70911 +#include <asm/xen/asm-xsi-offsets.h>
70912 +
70913 +#define IA64_PARAVIRTUALIZED
70914 +
70915 +#ifdef __ASSEMBLY__
70916 +#define        XEN_HYPER_RFI                   break 0x1
70917 +#define        XEN_HYPER_RSM_PSR_DT            break 0x2
70918 +#define        XEN_HYPER_SSM_PSR_DT            break 0x3
70919 +#define        XEN_HYPER_COVER                 break 0x4
70920 +#define        XEN_HYPER_ITC_D                 break 0x5
70921 +#define        XEN_HYPER_ITC_I                 break 0x6
70922 +#define        XEN_HYPER_SSM_I                 break 0x7
70923 +#define        XEN_HYPER_GET_IVR               break 0x8
70924 +#define        XEN_HYPER_GET_TPR               break 0x9
70925 +#define        XEN_HYPER_SET_TPR               break 0xa
70926 +#define        XEN_HYPER_EOI                   break 0xb
70927 +#define        XEN_HYPER_SET_ITM               break 0xc
70928 +#define        XEN_HYPER_THASH                 break 0xd
70929 +#define        XEN_HYPER_PTC_GA                break 0xe
70930 +#define        XEN_HYPER_ITR_D                 break 0xf
70931 +#define        XEN_HYPER_GET_RR                break 0x10
70932 +#define        XEN_HYPER_SET_RR                break 0x11
70933 +#define        XEN_HYPER_SET_KR                break 0x12
70934 +#define        XEN_HYPER_FC                    break 0x13
70935 +#define        XEN_HYPER_GET_CPUID             break 0x14
70936 +#define        XEN_HYPER_GET_PMD               break 0x15
70937 +#define        XEN_HYPER_GET_EFLAG             break 0x16
70938 +#define        XEN_HYPER_SET_EFLAG             break 0x17
70939 +#endif
70940 +
70941 +#ifndef __ASSEMBLY__
70942 +#ifdef MODULE
70943 +extern int is_running_on_xen(void);
70944 +#define running_on_xen (is_running_on_xen())
70945 +#else
70946 +extern int running_on_xen;
70947 +#endif
70948 +
70949 +#define        XEN_HYPER_SSM_I                 asm("break 0x7");
70950 +#define        XEN_HYPER_GET_IVR               asm("break 0x8");
70951 +
70952 +/************************************************/
70953 +/* Instructions paravirtualized for correctness */
70954 +/************************************************/
70955 +
70956 +/* "fc" and "thash" are privilege-sensitive instructions, meaning they
70957 + *  may have different semantics depending on whether they are executed
70958 + *  at PL0 vs PL!=0.  When paravirtualized, these instructions mustn't
70959 + *  be allowed to execute directly, lest incorrect semantics result. */
70960 +extern unsigned long xen_fc(unsigned long addr);
70961 +#define ia64_fc(addr)                  xen_fc((unsigned long)(addr))
70962 +extern unsigned long xen_thash(unsigned long addr);
70963 +#define ia64_thash(addr)               xen_thash((unsigned long)(addr))
70964 +/* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
70965 + * is not currently used (though it may be in a long-format VHPT system!)
70966 + * and the semantics of cover only change if psr.ic is off, which is very
70967 + * rare (and currently non-existent outside of assembly code). */
70968 +
70969 +/* There are also privilege-sensitive registers.  These registers are
70970 + * readable at any privilege level but only writable at PL0. */
70971 +extern unsigned long xen_get_cpuid(int index);
70972 +#define        ia64_get_cpuid(i)               xen_get_cpuid(i)
70973 +extern unsigned long xen_get_pmd(int index);
70974 +#define        ia64_get_pmd(i)                 xen_get_pmd(i)
70975 +extern unsigned long xen_get_eflag(void);      /* see xen_ia64_getreg */
70976 +extern void xen_set_eflag(unsigned long);      /* see xen_ia64_setreg */
70977 +
70978 +/************************************************/
70979 +/* Instructions paravirtualized for performance */
70980 +/************************************************/
70981 +
70982 +/* Xen uses memory-mapped virtual privileged registers for access to many
70983 + * performance-sensitive privileged registers.  Some, like the processor
70984 + * status register (psr), are broken up into multiple memory locations.
70985 + * Others, like "pend", are abstractions based on privileged registers.
70986 + * "Pend" is guaranteed to be set if reading cr.ivr would return a
70987 + * (non-spurious) interrupt. */
70988 +#define xen_get_virtual_psr_i()                (*(int *)(XSI_PSR_I))
70989 +#define xen_set_virtual_psr_i(_val)    ({ *(int *)(XSI_PSR_I) = _val ? 1:0; })
70990 +#define xen_set_virtual_psr_ic(_val)   ({ *(int *)(XSI_PSR_IC) = _val ? 1:0; })
70991 +#define xen_get_virtual_pend()         (*(int *)(XSI_PEND))
70992 +
70993 +/* Hyperprivops are "break" instructions with a well-defined API.
70994 + * In particular, the virtual psr.ic bit must be off; in this way
70995 + * it is guaranteed to never conflict with a linux break instruction.
70996 + * Normally, this is done in a Xen stub, but this one is frequent enough
70997 + * that we inline it. */
70998 +#define xen_hyper_ssm_i()                                              \
70999 +({                                                                     \
71000 +       xen_set_virtual_psr_i(0);                                       \
71001 +       xen_set_virtual_psr_ic(0);                                      \
71002 +       XEN_HYPER_SSM_I;                                                \
71003 +})
71004 +
71005 +/* turning off interrupts can be paravirtualized simply by writing
71006 + * to a memory-mapped virtual psr.i bit (implemented as a 16-bit bool) */
71007 +#define xen_rsm_i()    xen_set_virtual_psr_i(0)
71008 +
71009 +/* turning on interrupts is a bit more complicated: write to the
71010 + * memory-mapped virtual psr.i bit first (to avoid a race condition),
71011 + * then if any interrupts were pending, we have to execute a hyperprivop
71012 + * to ensure the pending interrupt gets delivered; else we're done! */
71013 +#define xen_ssm_i()                                                    \
71014 +({                                                                     \
71015 +       int old = xen_get_virtual_psr_i();                              \
71016 +       xen_set_virtual_psr_i(1);                                       \
71017 +       if (!old && xen_get_virtual_pend()) xen_hyper_ssm_i();          \
71018 +})
71019 +
71020 +#define xen_ia64_intrin_local_irq_restore(x)                           \
71021 +{                                                                      \
71022 +     if (running_on_xen) {                                             \
71023 +       if ((x) & IA64_PSR_I) { xen_ssm_i(); }                          \
71024 +       else { xen_rsm_i(); }                                           \
71025 +    }                                                                  \
71026 +    else __ia64_intrin_local_irq_restore((x));                         \
71027 +}
71028 +
71029 +#define        xen_get_psr_i()                                                 \
71030 +(                                                                      \
71031 +       (running_on_xen) ?                                              \
71032 +               (xen_get_virtual_psr_i() ? IA64_PSR_I : 0)              \
71033 +               : __ia64_get_psr_i()                                    \
71034 +)
71035 +
71036 +#define xen_ia64_ssm(mask)                                             \
71037 +{                                                                      \
71038 +       if ((mask)==IA64_PSR_I) {                                       \
71039 +               if (running_on_xen) { xen_ssm_i(); }                    \
71040 +               else { __ia64_ssm(mask); }                              \
71041 +       }                                                               \
71042 +       else { __ia64_ssm(mask); }                                      \
71043 +}
71044 +
71045 +#define xen_ia64_rsm(mask)                                             \
71046 +{                                                                      \
71047 +       if ((mask)==IA64_PSR_I) {                                       \
71048 +               if (running_on_xen) { xen_rsm_i(); }                    \
71049 +               else { __ia64_rsm(mask); }                              \
71050 +       }                                                               \
71051 +       else { __ia64_rsm(mask); }                                      \
71052 +}
71053 +
71054 +
71055 +/* Although all privileged operations can be left to trap and will
71056 + * be properly handled by Xen, some are frequent enough that we use
71057 + * hyperprivops for performance. */
71058 +
71059 +extern unsigned long xen_get_ivr(void);
71060 +extern unsigned long xen_get_tpr(void);
71061 +extern void xen_set_itm(unsigned long);
71062 +extern void xen_set_tpr(unsigned long);
71063 +extern void xen_eoi(void);
71064 +extern void xen_set_rr(unsigned long index, unsigned long val);
71065 +extern unsigned long xen_get_rr(unsigned long index);
71066 +extern void xen_set_kr(unsigned long index, unsigned long val);
71067 +
71068 +/* Note: It may look wrong to test for running_on_xen in each case.
71069 + * However, regnum is always a constant so, as written, the compiler
71070 + * eliminates the switch statement, whereas running_on_xen must be
71071 + * tested dynamically. */
71072 +#define xen_ia64_getreg(regnum)                                                \
71073 +({                                                                     \
71074 +       __u64 ia64_intri_res;                                           \
71075 +                                                                       \
71076 +       switch(regnum) {                                                \
71077 +       case _IA64_REG_CR_IVR:                                          \
71078 +               ia64_intri_res = (running_on_xen) ?                     \
71079 +                       xen_get_ivr() :                                 \
71080 +                       __ia64_getreg(regnum);                          \
71081 +               break;                                                  \
71082 +       case _IA64_REG_CR_TPR:                                          \
71083 +               ia64_intri_res = (running_on_xen) ?                     \
71084 +                       xen_get_tpr() :                                 \
71085 +                       __ia64_getreg(regnum);                          \
71086 +               break;                                                  \
71087 +       case _IA64_REG_AR_EFLAG:                                        \
71088 +               ia64_intri_res = (running_on_xen) ?                     \
71089 +                       xen_get_eflag() :                               \
71090 +                       __ia64_getreg(regnum);                          \
71091 +               break;                                                  \
71092 +       default:                                                        \
71093 +               ia64_intri_res = __ia64_getreg(regnum);                 \
71094 +               break;                                                  \
71095 +       }                                                               \
71096 +       ia64_intri_res;                                                 \
71097 +})
71098 +
71099 +#define xen_ia64_setreg(regnum,val)                                    \
71100 +({                                                                     \
71101 +       switch(regnum) {                                                \
71102 +       case _IA64_REG_AR_KR0 ... _IA64_REG_AR_KR7:                     \
71103 +               (running_on_xen) ?                                      \
71104 +                       xen_set_kr((regnum-_IA64_REG_AR_KR0), val) :    \
71105 +                       __ia64_setreg(regnum,val);                      \
71106 +               break;                                                  \
71107 +       case _IA64_REG_CR_ITM:                                          \
71108 +               (running_on_xen) ?                                      \
71109 +                       xen_set_itm(val) :                              \
71110 +                       __ia64_setreg(regnum,val);                      \
71111 +               break;                                                  \
71112 +       case _IA64_REG_CR_TPR:                                          \
71113 +               (running_on_xen) ?                                      \
71114 +                       xen_set_tpr(val) :                              \
71115 +                       __ia64_setreg(regnum,val);                      \
71116 +               break;                                                  \
71117 +       case _IA64_REG_CR_EOI:                                          \
71118 +               (running_on_xen) ?                                      \
71119 +                       xen_eoi() :                                     \
71120 +                       __ia64_setreg(regnum,val);                      \
71121 +               break;                                                  \
71122 +       case _IA64_REG_AR_EFLAG:                                        \
71123 +               (running_on_xen) ?                                      \
71124 +                       xen_set_eflag(val) :                            \
71125 +                       __ia64_setreg(regnum,val);                      \
71126 +               break;                                                  \
71127 +       default:                                                        \
71128 +               __ia64_setreg(regnum,val);                              \
71129 +               break;                                                  \
71130 +       }                                                               \
71131 +})
71132 +
71133 +#define ia64_ssm                       xen_ia64_ssm
71134 +#define ia64_rsm                       xen_ia64_rsm
71135 +#define ia64_intrin_local_irq_restore  xen_ia64_intrin_local_irq_restore
71136 +#define        ia64_ptcga                      xen_ptcga
71137 +#define        ia64_set_rr(index,val)          xen_set_rr(index,val)
71138 +#define        ia64_get_rr(index)              xen_get_rr(index)
71139 +#define ia64_getreg                    xen_ia64_getreg
71140 +#define ia64_setreg                    xen_ia64_setreg
71141 +#define        ia64_get_psr_i                  xen_get_psr_i
71142 +
71143 +/* the remainder of these are not performance-sensitive, so it's
71144 + * OK to not paravirtualize and just take a privop trap and emulate */
71145 +#define ia64_hint                      __ia64_hint
71146 +#define ia64_set_pmd                   __ia64_set_pmd
71147 +#define ia64_itci                      __ia64_itci
71148 +#define ia64_itcd                      __ia64_itcd
71149 +#define ia64_itri                      __ia64_itri
71150 +#define ia64_itrd                      __ia64_itrd
71151 +#define ia64_tpa                       __ia64_tpa
71152 +#define ia64_set_ibr                   __ia64_set_ibr
71153 +#define ia64_set_pkr                   __ia64_set_pkr
71154 +#define ia64_set_pmc                   __ia64_set_pmc
71155 +#define ia64_get_ibr                   __ia64_get_ibr
71156 +#define ia64_get_pkr                   __ia64_get_pkr
71157 +#define ia64_get_pmc                   __ia64_get_pmc
71158 +#define ia64_ptce                      __ia64_ptce
71159 +#define ia64_ptcl                      __ia64_ptcl
71160 +#define ia64_ptri                      __ia64_ptri
71161 +#define ia64_ptrd                      __ia64_ptrd
71162 +
71163 +#endif /* !__ASSEMBLY__ */
71164 +
71165 +/* these routines utilize privilege-sensitive or performance-sensitive
71166 + * privileged instructions so the code must be replaced with
71167 + * paravirtualized versions */
71168 +#define ia64_pal_halt_light            xen_pal_halt_light
71169 +#define        ia64_leave_kernel               xen_leave_kernel
71170 +#define        ia64_leave_syscall              xen_leave_syscall
71171 +#define        ia64_trace_syscall              xen_trace_syscall
71172 +#define        ia64_switch_to                  xen_switch_to
71173 +#define        ia64_pal_call_static            xen_pal_call_static
71174 +
71175 +#endif /* _ASM_IA64_XEN_PRIVOP_H */
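The ordering inside xen_ssm_i() above is load-bearing: the virtual psr.i bit is raised before the pending check, so any event arriving in between is delivered through the normal path, and the hyperprivop is needed only for events that were already pending. The sequence, restated:

        /* 1. old = xen_get_virtual_psr_i();    remember the prior state
         * 2. xen_set_virtual_psr_i(1);         interrupts logically on; events
         *                                      arriving after this point are
         *                                      delivered normally
         * 3. if (!old && xen_get_virtual_pend())
         *            xen_hyper_ssm_i();        flush events that were already
         *                                      pending before step 2
         */
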
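As the comment before xen_ia64_getreg() explains, regnum is always a compile-time constant, so the switch folds down to a single arm and only the running_on_xen test remains at runtime. For one concrete case (illustrative result of that folding):

        /* sketch: ia64_getreg(_IA64_REG_CR_IVR) effectively compiles to */
        ia64_intri_res = running_on_xen ? xen_get_ivr()
                                        : __ia64_getreg(_IA64_REG_CR_IVR);
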
71176 diff -Nurp ref-linux-2.6.16.9/include/asm-um/page.h tmp-linux-2.6-xen.patch/include/asm-um/page.h
71177 --- ref-linux-2.6.16.9/include/asm-um/page.h    2006-04-19 08:10:14.000000000 +0200
71178 +++ tmp-linux-2.6-xen.patch/include/asm-um/page.h       2006-04-10 00:05:52.000000000 +0200
71179 @@ -118,7 +118,7 @@ extern unsigned long uml_physmem;
71180  extern struct page *arch_validate(struct page *page, gfp_t mask, int order);
71181  #define HAVE_ARCH_VALIDATE
71182  
71183 -extern void arch_free_page(struct page *page, int order);
71184 +extern int arch_free_page(struct page *page, int order);
71185  #define HAVE_ARCH_FREE_PAGE
71186  
71187  #include <asm-generic/page.h>
71188 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/apic.h tmp-linux-2.6-xen.patch/include/asm-x86_64/apic.h
71189 --- ref-linux-2.6.16.9/include/asm-x86_64/apic.h        2006-04-19 08:10:14.000000000 +0200
71190 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/apic.h   2006-04-10 00:05:52.000000000 +0200
71191 @@ -105,11 +105,13 @@ extern int disable_timer_pin_1;
71192  
71193  extern void setup_threshold_lvt(unsigned long lvt_off);
71194  
71195 +#ifndef CONFIG_XEN
71196  void smp_send_timer_broadcast_ipi(void);
71197  void switch_APIC_timer_to_ipi(void *cpumask);
71198  void switch_ipi_to_APIC_timer(void *cpumask);
71199  
71200  #define ARCH_APICTIMER_STOPS_ON_C3     1
71201 +#endif
71202  
71203  #endif /* CONFIG_X86_LOCAL_APIC */
71204  
71205 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/arch_hooks.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/arch_hooks.h
71206 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/arch_hooks.h     1970-01-01 01:00:00.000000000 +0100
71207 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/arch_hooks.h        2006-04-10 00:05:52.000000000 +0200
71208 @@ -0,0 +1,27 @@
71209 +#ifndef _ASM_ARCH_HOOKS_H
71210 +#define _ASM_ARCH_HOOKS_H
71211 +
71212 +#include <linux/interrupt.h>
71213 +
71214 +/*
71215 + *     linux/include/asm/arch_hooks.h
71216 + *
71217 + *     define the architecture specific hooks 
71218 + */
71219 +
71220 +/* these aren't arch hooks; they are generic routines
71221 + * that can be used by the hooks */
71222 +extern void init_ISA_irqs(void);
71223 +extern void apic_intr_init(void);
71224 +extern void smp_intr_init(void);
71225 +extern irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs);
71226 +
71227 +/* these are the defined hooks */
71228 +extern void intr_init_hook(void);
71229 +extern void pre_intr_init_hook(void);
71230 +extern void pre_setup_arch_hook(void);
71231 +extern void trap_init_hook(void);
71232 +extern void time_init_hook(void);
71233 +extern void mca_nmi_hook(void);
71234 +
71235 +#endif
71236 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/bootsetup.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/bootsetup.h
71237 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/bootsetup.h      1970-01-01 01:00:00.000000000 +0100
71238 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/bootsetup.h 2006-04-10 00:05:52.000000000 +0200
71239 @@ -0,0 +1,42 @@
71240 +
71241 +#ifndef _X86_64_BOOTSETUP_H
71242 +#define _X86_64_BOOTSETUP_H 1
71243 +
71244 +#define BOOT_PARAM_SIZE                4096
71245 +extern char x86_boot_params[BOOT_PARAM_SIZE];
71246 +
71247 +/*
71248 + * This is set up by the setup-routine at boot-time
71249 + */
71250 +#define PARAM  ((unsigned char *)x86_boot_params)
71251 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
71252 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
71253 +#define ALT_MEM_K (*(unsigned int *) (PARAM+0x1e0))
71254 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
71255 +#define E820_MAP    ((struct e820entry *) (PARAM+E820MAP))
71256 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
71257 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
71258 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
71259 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
71260 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
71261 +#define SAVED_VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
71262 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
71263 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
71264 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
71265 +#define KERNEL_START (*(unsigned int *) (PARAM+0x214))
71266 +
71267 +#define INITRD_START (__pa(xen_start_info->mod_start))
71268 +#define INITRD_SIZE (xen_start_info->mod_len)
71269 +#define EDID_INFO   (*(struct edid_info *) (PARAM+0x440))
71270 +
71271 +#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
71272 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
71273 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
71274 +#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
71275 +#define COMMAND_LINE saved_command_line
71276 +
71277 +#define RAMDISK_IMAGE_START_MASK       0x07FF
71278 +#define RAMDISK_PROMPT_FLAG            0x8000
71279 +#define RAMDISK_LOAD_FLAG              0x4000  
71280 +
71281 +#endif
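Note that INITRD_START and INITRD_SIZE above bypass the boot-parameter page entirely: a Xen guest learns its initrd (boot module) location from the start_info page instead. For comparison, a sketch of both sources; the native offsets quoted are an assumption from the stock x86-64 boot protocol, not taken from this patch:

        /* Xen guest (this header) */
        start = __pa(xen_start_info->mod_start);
        size  = xen_start_info->mod_len;
        /* native boot protocol (assumed offsets 0x218/0x21c) */
        start = *(unsigned int *)(PARAM + 0x218);
        size  = *(unsigned int *)(PARAM + 0x21c);
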
71282 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/desc.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/desc.h
71283 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/desc.h   1970-01-01 01:00:00.000000000 +0100
71284 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/desc.h      2006-04-10 00:05:52.000000000 +0200
71285 @@ -0,0 +1,263 @@
71286 +/* Written 2000 by Andi Kleen */ 
71287 +#ifndef __ARCH_DESC_H
71288 +#define __ARCH_DESC_H
71289 +
71290 +#include <linux/threads.h>
71291 +#include <asm/ldt.h>
71292 +
71293 +#ifndef __ASSEMBLY__
71294 +
71295 +#include <linux/string.h>
71296 +#include <linux/smp.h>
71297 +
71298 +#include <asm/segment.h>
71299 +#include <asm/mmu.h>
71300 +
71301 +// 8 byte segment descriptor
71302 +struct desc_struct { 
71303 +       u16 limit0;
71304 +       u16 base0;
71305 +       unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
71306 +       unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
71307 +} __attribute__((packed)); 
71308 +
71309 +struct n_desc_struct { 
71310 +       unsigned int a,b;
71311 +};     
71312 +
71313 +enum { 
71314 +       GATE_INTERRUPT = 0xE, 
71315 +       GATE_TRAP = 0xF,        
71316 +       GATE_CALL = 0xC,
71317 +};     
71318 +
71319 +// 16byte gate
71320 +struct gate_struct {          
71321 +       u16 offset_low;
71322 +       u16 segment; 
71323 +       unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
71324 +       u16 offset_middle;
71325 +       u32 offset_high;
71326 +       u32 zero1; 
71327 +} __attribute__((packed));
71328 +
71329 +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) 
71330 +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
71331 +#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
71332 +
71333 +enum { 
71334 +       DESC_TSS = 0x9,
71335 +       DESC_LDT = 0x2,
71336 +}; 
71337 +
71338 +// LDT or TSS descriptor in the GDT. 16 bytes.
71339 +struct ldttss_desc { 
71340 +       u16 limit0;
71341 +       u16 base0;
71342 +       unsigned base1 : 8, type : 5, dpl : 2, p : 1;
71343 +       unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
71344 +       u32 base3;
71345 +       u32 zero1; 
71346 +} __attribute__((packed)); 
71347 +
71348 +struct desc_ptr {
71349 +       unsigned short size;
71350 +       unsigned long address;
71351 +} __attribute__((packed)) ;
71352 +
71353 +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
71354 +
71355 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
71356 +
71357 +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
71358 +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
71359 +
71360 +static inline void clear_LDT(void)
71361 +{
71362 +       int cpu = get_cpu();
71363 +
71364 +       /*
71365 +        * NB. We load the default_ldt for lcall7/27 handling on demand, as
71366 +        * it slows down context switching. No one uses it anyway.
71367 +        */
71368 +       cpu = cpu;              /* XXX avoid compiler warning */
71369 +       xen_set_ldt(0UL, 0);
71370 +       put_cpu();
71371 +}
71372 +
71373 +/*
71374 + * This is the ldt that every process will get unless we need
71375 + * something other than this.
71376 + */
71377 +extern struct desc_struct default_ldt[];
71378 +#ifndef CONFIG_X86_NO_IDT
71379 +extern struct gate_struct idt_table[]; 
71380 +#endif
71381 +extern struct desc_ptr cpu_gdt_descr[];
71382 +
71383 +/* the cpu gdt accessor */
71384 +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
71385 +
71386 +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)  
71387 +{
71388 +       struct gate_struct s;   
71389 +       s.offset_low = PTR_LOW(func); 
71390 +       s.segment = __KERNEL_CS;
71391 +       s.ist = ist; 
71392 +       s.p = 1;
71393 +       s.dpl = dpl; 
71394 +       s.zero0 = 0;
71395 +       s.zero1 = 0; 
71396 +       s.type = type; 
71397 +       s.offset_middle = PTR_MIDDLE(func); 
71398 +       s.offset_high = PTR_HIGH(func); 
71399 +       /* does not need to be atomic because it is only done once at setup time */ 
71400 +       memcpy(adr, &s, 16); 
71401 +} 
71402 +
71403 +#ifndef CONFIG_X86_NO_IDT
71404 +static inline void set_intr_gate(int nr, void *func) 
71405 +{ 
71406 +       BUG_ON((unsigned)nr > 0xFF);
71407 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 
71408 +} 
71409 +
71410 +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 
71411 +{ 
71412 +       BUG_ON((unsigned)nr > 0xFF);
71413 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 
71414 +} 
71415 +
71416 +static inline void set_system_gate(int nr, void *func) 
71417 +{ 
71418 +       BUG_ON((unsigned)nr > 0xFF);
71419 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 
71420 +} 
71421 +
71422 +static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
71423 +{
71424 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
71425 +}
71426 +#endif
71427 +
71428 +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, 
71429 +                                        unsigned size) 
71430 +{ 
71431 +       struct ldttss_desc d;
71432 +       memset(&d,0,sizeof(d)); 
71433 +       d.limit0 = size & 0xFFFF;
71434 +       d.base0 = PTR_LOW(tss); 
71435 +       d.base1 = PTR_MIDDLE(tss) & 0xFF; 
71436 +       d.type = type;
71437 +       d.p = 1; 
71438 +       d.limit1 = (size >> 16) & 0xF;
71439 +       d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; 
71440 +       d.base3 = PTR_HIGH(tss); 
71441 +       memcpy(ptr, &d, 16); 
71442 +}
71443 +
71444 +#ifndef CONFIG_X86_NO_TSS
71445 +static inline void set_tss_desc(unsigned cpu, void *addr)
71446 +{ 
71447 +       /*
71448 +        * sizeof(unsigned long) coming from an extra "long" at the end
71449 +        * of the iobitmap. See tss_struct definition in processor.h
71450 +        *
71451 +        * -1? seg base+limit should be pointing to the address of the
71452 +        * last valid byte
71453 +        */
71454 +       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS], 
71455 +               (unsigned long)addr, DESC_TSS,
71456 +               IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
71457 +} 
71458 +#endif
71459 +
71460 +static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
71461 +{ 
71462 +       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
71463 +                             DESC_LDT, size * 8 - 1);
71464 +}
71465 +
71466 +static inline void set_seg_base(unsigned cpu, int entry, void *base)
71467 +{ 
71468 +       struct desc_struct *d = &cpu_gdt(cpu)[entry];
71469 +       u32 addr = (u32)(u64)base;
71470 +       BUG_ON((u64)base >> 32); 
71471 +       d->base0 = addr & 0xffff;
71472 +       d->base1 = (addr >> 16) & 0xff;
71473 +       d->base2 = (addr >> 24) & 0xff;
71474 +} 
71475 +
71476 +#define LDT_entry_a(info) \
71477 +       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
71478 +/* Don't allow setting of the lm bit. It is useless anyways because 
71479 +   64bit system calls require __USER_CS. */ 
71480 +#define LDT_entry_b(info) \
71481 +       (((info)->base_addr & 0xff000000) | \
71482 +       (((info)->base_addr & 0x00ff0000) >> 16) | \
71483 +       ((info)->limit & 0xf0000) | \
71484 +       (((info)->read_exec_only ^ 1) << 9) | \
71485 +       ((info)->contents << 10) | \
71486 +       (((info)->seg_not_present ^ 1) << 15) | \
71487 +       ((info)->seg_32bit << 22) | \
71488 +       ((info)->limit_in_pages << 23) | \
71489 +       ((info)->useable << 20) | \
71490 +       /* ((info)->lm << 21) | */ \
71491 +       0x7000)
71492 +
71493 +#define LDT_empty(info) (\
71494 +       (info)->base_addr       == 0    && \
71495 +       (info)->limit           == 0    && \
71496 +       (info)->contents        == 0    && \
71497 +       (info)->read_exec_only  == 1    && \
71498 +       (info)->seg_32bit       == 0    && \
71499 +       (info)->limit_in_pages  == 0    && \
71500 +       (info)->seg_not_present == 1    && \
71501 +       (info)->useable         == 0    && \
71502 +       (info)->lm              == 0)
71503 +
71504 +#if TLS_SIZE != 24
71505 +# error update this code.
71506 +#endif
71507 +
71508 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
71509 +{
71510 +#if 0
71511 +       u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
71512 +       gdt[0] = t->tls_array[0];
71513 +       gdt[1] = t->tls_array[1];
71514 +       gdt[2] = t->tls_array[2];
71515 +#endif
71516 +#define C(i) \
71517 +       HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), t->tls_array[i])
71518 +
71519 +       C(0); C(1); C(2);
71520 +#undef C
71521 +} 
71522 +
71523 +/*
71524 + * load one particular LDT into the current CPU
71525 + */
71526 +static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
71527 +{
71528 +       void *segments = pc->ldt;
71529 +       int count = pc->size;
71530 +
71531 +       if (likely(!count))
71532 +               segments = NULL;
71533 +
71534 +       xen_set_ldt((unsigned long)segments, count);
71535 +}
71536 +
71537 +static inline void load_LDT(mm_context_t *pc)
71538 +{
71539 +       int cpu = get_cpu();
71540 +       load_LDT_nolock(pc, cpu);
71541 +       put_cpu();
71542 +}
71543 +
71544 +extern struct desc_ptr idt_descr;
71545 +
71546 +#endif /* !__ASSEMBLY__ */
71547 +
71548 +#endif
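load_TLS() above keeps the direct GDT stores only under #if 0 and issues one HYPERVISOR_update_descriptor() hypercall per TLS slot instead; the usual reading is that Xen validates guest descriptor tables and maps them read-only, so in-place writes would fault (that rationale is background, not stated in the patch). Unrolled, the C() macro amounts to:

        /* sketch: what C(0); C(1); C(2); expands to */
        int i;
        for (i = 0; i < 3; i++)
                HYPERVISOR_update_descriptor(
                        virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]),
                        t->tls_array[i]);
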
71549 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/dma-mapping.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/dma-mapping.h
71550 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/dma-mapping.h    1970-01-01 01:00:00.000000000 +0100
71551 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/dma-mapping.h       2006-04-10 00:05:52.000000000 +0200
71552 @@ -0,0 +1,191 @@
71553 +#ifndef _X8664_DMA_MAPPING_H
71554 +#define _X8664_DMA_MAPPING_H 1
71555 +
71556 +/*
71557 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
71558 + * documentation.
71559 + */
71560 +
71561 +#include <linux/config.h>
71562 +
71563 +#include <asm/scatterlist.h>
71564 +#include <asm/io.h>
71565 +#include <asm/swiotlb.h>
71566 +
71567 +struct dma_mapping_ops {
71568 +       int             (*mapping_error)(dma_addr_t dma_addr);
71569 +       void*           (*alloc_coherent)(struct device *dev, size_t size,
71570 +                                dma_addr_t *dma_handle, gfp_t gfp);
71571 +       void            (*free_coherent)(struct device *dev, size_t size,
71572 +                                void *vaddr, dma_addr_t dma_handle);
71573 +       dma_addr_t      (*map_single)(struct device *hwdev, void *ptr,
71574 +                                size_t size, int direction);
71575 +       /* like map_single, but doesn't check the device mask */
71576 +       dma_addr_t      (*map_simple)(struct device *hwdev, char *ptr,
71577 +                                size_t size, int direction);
71578 +       void            (*unmap_single)(struct device *dev, dma_addr_t addr,
71579 +                               size_t size, int direction);
71580 +       void            (*sync_single_for_cpu)(struct device *hwdev,
71581 +                               dma_addr_t dma_handle, size_t size,
71582 +                               int direction);
71583 +       void            (*sync_single_for_device)(struct device *hwdev,
71584 +                                dma_addr_t dma_handle, size_t size,
71585 +                               int direction);
71586 +       void            (*sync_single_range_for_cpu)(struct device *hwdev,
71587 +                                dma_addr_t dma_handle, unsigned long offset,
71588 +                               size_t size, int direction);
71589 +       void            (*sync_single_range_for_device)(struct device *hwdev,
71590 +                               dma_addr_t dma_handle, unsigned long offset,
71591 +                               size_t size, int direction);
71592 +       void            (*sync_sg_for_cpu)(struct device *hwdev,
71593 +                                struct scatterlist *sg, int nelems,
71594 +                               int direction);
71595 +       void            (*sync_sg_for_device)(struct device *hwdev,
71596 +                               struct scatterlist *sg, int nelems,
71597 +                               int direction);
71598 +       int             (*map_sg)(struct device *hwdev, struct scatterlist *sg,
71599 +                               int nents, int direction);
71600 +       void            (*unmap_sg)(struct device *hwdev,
71601 +                               struct scatterlist *sg, int nents,
71602 +                               int direction);
71603 +       int             (*dma_supported)(struct device *hwdev, u64 mask);
71604 +       int             is_phys;
71605 +};
71606 +
71607 +extern dma_addr_t bad_dma_address;
71608 +extern struct dma_mapping_ops* dma_ops;
71609 +extern int iommu_merge;
71610 +
71611 +#if 0
71612 +static inline int dma_mapping_error(dma_addr_t dma_addr)
71613 +{
71614 +       if (dma_ops->mapping_error)
71615 +               return dma_ops->mapping_error(dma_addr);
71616 +
71617 +       return (dma_addr == bad_dma_address);
71618 +}
71619 +
71620 +extern void *dma_alloc_coherent(struct device *dev, size_t size,
71621 +                               dma_addr_t *dma_handle, gfp_t gfp);
71622 +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
71623 +                             dma_addr_t dma_handle);
71624 +
71625 +static inline dma_addr_t
71626 +dma_map_single(struct device *hwdev, void *ptr, size_t size,
71627 +              int direction)
71628 +{
71629 +       return dma_ops->map_single(hwdev, ptr, size, direction);
71630 +}
71631 +
71632 +static inline void
71633 +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
71634 +                int direction)
71635 +{
71636 +       dma_ops->unmap_single(dev, addr, size, direction);
71637 +}
71638 +
71639 +#define dma_map_page(dev,page,offset,size,dir) \
71640 +       dma_map_single((dev), page_address(page)+(offset), (size), (dir))
71641 +
71642 +#define dma_unmap_page dma_unmap_single
71643 +
71644 +static inline void
71645 +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
71646 +                       size_t size, int direction)
71647 +{
71648 +       if (dma_ops->sync_single_for_cpu)
71649 +               dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
71650 +                                            direction);
71651 +       flush_write_buffers();
71652 +}
71653 +
71654 +static inline void
71655 +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
71656 +                          size_t size, int direction)
71657 +{
71658 +       if (dma_ops->sync_single_for_device)
71659 +               dma_ops->sync_single_for_device(hwdev, dma_handle, size,
71660 +                                               direction);
71661 +       flush_write_buffers();
71662 +}
71663 +
71664 +static inline void
71665 +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
71666 +                             unsigned long offset, size_t size, int direction)
71667 +{
71668 +       if (dma_ops->sync_single_range_for_cpu) {
71669 +               dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
71670 +       }
71671 +
71672 +       flush_write_buffers();
71673 +}
71674 +
71675 +static inline void
71676 +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
71677 +                                unsigned long offset, size_t size, int direction)
71678 +{
71679 +       if (dma_ops->sync_single_range_for_device)
71680 +               dma_ops->sync_single_range_for_device(hwdev, dma_handle,
71681 +                                                     offset, size, direction);
71682 +
71683 +       flush_write_buffers();
71684 +}
71685 +
71686 +static inline void
71687 +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
71688 +                   int nelems, int direction)
71689 +{
71690 +       if (dma_ops->sync_sg_for_cpu)
71691 +               dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
71692 +       flush_write_buffers();
71693 +}
71694 +
71695 +static inline void
71696 +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
71697 +                      int nelems, int direction)
71698 +{
71699 +       if (dma_ops->sync_sg_for_device) {
71700 +               dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
71701 +       }
71702 +
71703 +       flush_write_buffers();
71704 +}
71705 +
71706 +static inline int
71707 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
71708 +{
71709 +       return dma_ops->map_sg(hwdev, sg, nents, direction);
71710 +}
71711 +
71712 +static inline void
71713 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
71714 +            int direction)
71715 +{
71716 +       dma_ops->unmap_sg(hwdev, sg, nents, direction);
71717 +}
71718 +
71719 +extern int dma_supported(struct device *hwdev, u64 mask);
71720 +
71721 +/* same for gart, swiotlb, and nommu */
71722 +static inline int dma_get_cache_alignment(void)
71723 +{
71724 +       return boot_cpu_data.x86_clflush_size;
71725 +}
71726 +
71727 +#define dma_is_consistent(h) 1
71728 +
71729 +extern int dma_set_mask(struct device *dev, u64 mask);
71730 +
71731 +static inline void
71732 +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
71733 +{
71734 +       flush_write_buffers();
71735 +}
71736 +
71737 +extern struct device fallback_dev;
71738 +extern int panic_on_overflow;
71739 +#endif
71740 +
71741 +#endif /* _X8664_DMA_MAPPING_H */
71742 +
71743 +#include <asm-i386/mach-xen/asm/dma-mapping.h>
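struct dma_mapping_ops above is a runtime dispatch table: the global dma_ops pointer selects a backend ("same for gart, swiotlb, and nommu", per the comment) without recompiling callers. The inline wrappers sit under #if 0 because this build reuses the i386 mach-xen implementation included on the final line. The idiom, sketched:

        /* sketch: every call funnels through the selected backend */
        dma_addr_t h = dma_ops->map_single(dev, buf, len, DMA_TO_DEVICE);
        if (dma_ops->mapping_error && dma_ops->mapping_error(h))
                recover_from_failed_mapping();  /* hypothetical handler */
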
71744 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/fixmap.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/fixmap.h
71745 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/fixmap.h 1970-01-01 01:00:00.000000000 +0100
71746 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/fixmap.h    2006-04-10 00:05:52.000000000 +0200
71747 @@ -0,0 +1,109 @@
71748 +/*
71749 + * fixmap.h: compile-time virtual memory allocation
71750 + *
71751 + * This file is subject to the terms and conditions of the GNU General Public
71752 + * License.  See the file "COPYING" in the main directory of this archive
71753 + * for more details.
71754 + *
71755 + * Copyright (C) 1998 Ingo Molnar
71756 + */
71757 +
71758 +#ifndef _ASM_FIXMAP_H
71759 +#define _ASM_FIXMAP_H
71760 +
71761 +#include <linux/config.h>
71762 +#include <linux/kernel.h>
71763 +#include <asm/apicdef.h>
71764 +#include <xen/gnttab.h>
71765 +#include <asm/page.h>
71766 +#include <asm/vsyscall.h>
71767 +#include <asm/vsyscall32.h>
71768 +#include <asm/acpi.h>
71769 +
71770 +/*
71771 + * Here we define all the compile-time 'special' virtual
71772 + * addresses. The point is to have a constant address at
71773 + * compile time, but to set the physical address only
71774 + * in the boot process.
71775 + *
71776 + * these 'compile-time allocated' memory buffers are
71777 + * fixed-size 4k pages (or larger if used with an increment
71778 + * higher than 1). Use set_fixmap(idx,phys) to associate
71779 + * physical memory with fixmap indices.
71780 + *
71781 + * TLB entries of such buffers will not be flushed across
71782 + * task switches.
71783 + */
71784 +
71785 +enum fixed_addresses {
71786 +       VSYSCALL_LAST_PAGE,
71787 +       VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
71788 +       VSYSCALL_HPET,
71789 +       FIX_HPET_BASE,
71790 +#ifdef CONFIG_X86_LOCAL_APIC
71791 +       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
71792 +#endif
71793 +#ifdef CONFIG_X86_IO_APIC
71794 +       FIX_IO_APIC_BASE_0,
71795 +       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
71796 +#endif
71797 +#ifdef CONFIG_ACPI
71798 +       FIX_ACPI_BEGIN,
71799 +       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
71800 +#endif
71801 +       FIX_SHARED_INFO,
71802 +#define NR_FIX_ISAMAPS 256
71803 +       FIX_ISAMAP_END,
71804 +       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
71805 +       __end_of_fixed_addresses
71806 +};
71807 +
71808 +extern void __set_fixmap (enum fixed_addresses idx,
71809 +                                       unsigned long phys, pgprot_t flags);
71810 +
71811 +#define set_fixmap(idx, phys) \
71812 +               __set_fixmap(idx, phys, PAGE_KERNEL)
71813 +/*
71814 + * Some hardware wants to get fixmapped without caching.
71815 + */
71816 +#define set_fixmap_nocache(idx, phys) \
71817 +               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
71818 +
71819 +#define clear_fixmap(idx) \
71820 +                __set_fixmap(idx, 0, __pgprot(0))
71821 +
71822 +#define FIXADDR_TOP    (VSYSCALL_END-PAGE_SIZE)
71823 +#define FIXADDR_SIZE   (__end_of_fixed_addresses << PAGE_SHIFT)
71824 +#define FIXADDR_START  (FIXADDR_TOP - FIXADDR_SIZE)
71825 +
71826 +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
71827 +#define FIXADDR_USER_START     ((unsigned long)VSYSCALL32_VSYSCALL)
71828 +#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
71829 +
71830 +#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
71831 +
71832 +extern void __this_fixmap_does_not_exist(void);
71833 +
71834 +/*
71835 + * 'index to address' translation. If anyone tries to use the idx
71836 + * directly without translation, we catch the bug with a NULL-dereference
71837 + * kernel oops. Illegal ranges of incoming indices are caught too.
71838 + */
71839 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
71840 +{
71841 +       /*
71842 +        * this branch gets completely eliminated after inlining,
71843 +        * except when someone tries to use fixaddr indices in an
71844 +        * illegal way. (such as mixing up address types or using
71845 +        * out-of-range indices).
71846 +        *
71847 +        * If it doesn't get removed, the linker will complain
71848 +        * loudly with a reasonably clear error message.
71849 +        */
71850 +       if (idx >= __end_of_fixed_addresses)
71851 +               __this_fixmap_does_not_exist();
71852 +
71853 +        return __fix_to_virt(idx);
71854 +}
71855 +
71856 +#endif
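fix_to_virt() above does its range check at compile time: idx is constant, so the branch folds away, and when it cannot (an out-of-range or non-constant index) the surviving call to the deliberately undefined __this_fixmap_does_not_exist() breaks the link, exactly as the comment promises. The address math simply counts pages down from FIXADDR_TOP:

        /* sketch: fix_to_virt(FIX_SHARED_INFO) evaluates to */
        addr = FIXADDR_TOP - (FIX_SHARED_INFO << PAGE_SHIFT);
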
71857 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/floppy.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/floppy.h
71858 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/floppy.h 1970-01-01 01:00:00.000000000 +0100
71859 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/floppy.h    2006-04-10 00:05:52.000000000 +0200
71860 @@ -0,0 +1,206 @@
71861 +/*
71862 + * Architecture specific parts of the Floppy driver
71863 + *
71864 + * This file is subject to the terms and conditions of the GNU General Public
71865 + * License.  See the file "COPYING" in the main directory of this archive
71866 + * for more details.
71867 + *
71868 + * Copyright (C) 1995
71869 + *
71870 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
71871 + */
71872 +#ifndef __ASM_XEN_X86_64_FLOPPY_H
71873 +#define __ASM_XEN_X86_64_FLOPPY_H
71874 +
71875 +#include <linux/vmalloc.h>
71876 +
71877 +/*
71878 + * The DMA channel used by the floppy controller cannot access data at
71879 + * addresses >= 16MB
71880 + *
71881 + * Went back to the 1MB limit, as some people had problems with the floppy
71882 + * driver otherwise. It doesn't matter much for performance anyway, as most
71883 + * floppy accesses go through the track buffer.
71884 + */
71885 +#define _CROSS_64KB(a,s,vdma) \
71886 +(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
71887 +
71888 +/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
71889 +#include <asm/dma.h>
71890 +#undef MAX_DMA_ADDRESS
71891 +#define MAX_DMA_ADDRESS 0
71892 +#define CROSS_64KB(a,s) (0)
71893 +
71894 +#define fd_inb(port)                   inb_p(port)
71895 +#define fd_outb(value,port)            outb_p(value,port)
71896 +
71897 +#define fd_request_dma()        (0)
71898 +#define fd_free_dma()           ((void)0)
71899 +#define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
71900 +#define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
71901 +#define fd_free_irq()          free_irq(FLOPPY_IRQ, NULL)
71902 +#define fd_get_dma_residue()    vdma_get_dma_residue(FLOPPY_DMA)
71903 +/*
71904 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
71905 + * softirq context via motor_off_callback. This is a generic bug that we happen to trigger.
71906 + */
71907 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
71908 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
71909 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
71910 +
71911 +static int virtual_dma_count;
71912 +static int virtual_dma_residue;
71913 +static char *virtual_dma_addr;
71914 +static int virtual_dma_mode;
71915 +static int doing_pdma;
71916 +
71917 +static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
71918 +{
71919 +       register unsigned char st;
71920 +
71921 +#undef TRACE_FLPY_INT
71922 +
71923 +#ifdef TRACE_FLPY_INT
71924 +       static int calls=0;
71925 +       static int bytes=0;
71926 +       static int dma_wait=0;
71927 +#endif
71928 +       if (!doing_pdma)
71929 +               return floppy_interrupt(irq, dev_id, regs);
71930 +
71931 +#ifdef TRACE_FLPY_INT
71932 +       if(!calls)
71933 +               bytes = virtual_dma_count;
71934 +#endif
71935 +
71936 +       {
71937 +               register int lcount;
71938 +               register char *lptr;
71939 +
71940 +               st = 1;
71941 +               for(lcount=virtual_dma_count, lptr=virtual_dma_addr; 
71942 +                   lcount; lcount--, lptr++) {
71943 +                       st=inb(virtual_dma_port+4) & 0xa0 ;
71944 +                       if(st != 0xa0) 
71945 +                               break;
71946 +                       if(virtual_dma_mode)
71947 +                               outb_p(*lptr, virtual_dma_port+5);
71948 +                       else
71949 +                               *lptr = inb_p(virtual_dma_port+5);
71950 +               }
71951 +               virtual_dma_count = lcount;
71952 +               virtual_dma_addr = lptr;
71953 +               st = inb(virtual_dma_port+4);
71954 +       }
71955 +
71956 +#ifdef TRACE_FLPY_INT
71957 +       calls++;
71958 +#endif
71959 +       if(st == 0x20)
71960 +               return IRQ_HANDLED;
71961 +       if(!(st & 0x20)) {
71962 +               virtual_dma_residue += virtual_dma_count;
71963 +               virtual_dma_count=0;
71964 +#ifdef TRACE_FLPY_INT
71965 +               printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", 
71966 +                      virtual_dma_count, virtual_dma_residue, calls, bytes,
71967 +                      dma_wait);
71968 +               calls = 0;
71969 +               dma_wait=0;
71970 +#endif
71971 +               doing_pdma = 0;
71972 +               floppy_interrupt(irq, dev_id, regs);
71973 +               return IRQ_HANDLED;
71974 +       }
71975 +#ifdef TRACE_FLPY_INT
71976 +       if(!virtual_dma_count)
71977 +               dma_wait++;
71978 +#endif
71979 +       return IRQ_HANDLED;
71980 +}
71981 +
71982 +static void fd_disable_dma(void)
71983 +{
71984 +       doing_pdma = 0;
71985 +       virtual_dma_residue += virtual_dma_count;
71986 +       virtual_dma_count=0;
71987 +}
71988 +
71989 +static int vdma_get_dma_residue(unsigned int dummy)
71990 +{
71991 +       return virtual_dma_count + virtual_dma_residue;
71992 +}
71993 +
71994 +
71995 +static int fd_request_irq(void)
71996 +{
71997 +       return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
71998 +                                          "floppy", NULL);
71999 +}
72000 +
72001 +#if 0
72002 +static unsigned long vdma_mem_alloc(unsigned long size)
72003 +{
72004 +       return (unsigned long) vmalloc(size);
72005 +
72006 +}
72007 +
72008 +static void vdma_mem_free(unsigned long addr, unsigned long size)
72009 +{
72010 +       vfree((void *)addr);
72011 +}
72012 +#endif
72013 +
72014 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
72015 +{
72016 +       doing_pdma = 1;
72017 +       virtual_dma_port = io;
72018 +       virtual_dma_mode = (mode  == DMA_MODE_WRITE);
72019 +       virtual_dma_addr = addr;
72020 +       virtual_dma_count = size;
72021 +       virtual_dma_residue = 0;
72022 +       return 0;
72023 +}
72024 +
72025 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
72026 +#define FDC1 xen_floppy_init()
72027 +static int FDC2 = -1;
72028 +
72029 +static int xen_floppy_init(void)
72030 +{
72031 +       use_virtual_dma = 1;
72032 +       can_use_virtual_dma = 1;
72033 +       return 0x3f0;
72034 +}
72035 +
72036 +/*
72037 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
72038 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
72039 + * coincides with another rtc CMOS user.               Paul G.
72040 + */
72041 +#define FLOPPY0_TYPE   ({                              \
72042 +       unsigned long flags;                            \
72043 +       unsigned char val;                              \
72044 +       spin_lock_irqsave(&rtc_lock, flags);            \
72045 +       val = (CMOS_READ(0x10) >> 4) & 15;              \
72046 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
72047 +       val;                                            \
72048 +})
72049 +
72050 +#define FLOPPY1_TYPE   ({                              \
72051 +       unsigned long flags;                            \
72052 +       unsigned char val;                              \
72053 +       spin_lock_irqsave(&rtc_lock, flags);            \
72054 +       val = CMOS_READ(0x10) & 15;                     \
72055 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
72056 +       val;                                            \
72057 +})
72058 +
72059 +#define N_FDC 2
72060 +#define N_DRIVE 8
72061 +
72062 +#define FLOPPY_MOTOR_MASK 0xf0
72063 +
72064 +#define EXTRA_FLOPPY_PARAMS
72065 +
72066 +#endif /* __ASM_XEN_X86_64_FLOPPY_H */
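Forcing MAX_DMA_ADDRESS to 0 makes the floppy core reject ISA DMA outright, so every transfer takes the virtual-DMA path and floppy_hardint() moves the data by programmed I/O. Its per-byte handshake, condensed into comments:

        /* sketch: PIO handshake in floppy_hardint(), base = virtual_dma_port
         * status = inb(base + 4) & 0xa0;
         *   0xa0 -> a byte is ready: transfer it through base + 5, continue
         *   bit 0x20 clear after the loop -> command finished, so clear
         *   doing_pdma and hand off to floppy_interrupt()
         */
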
72067 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/hw_irq.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/hw_irq.h
72068 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/hw_irq.h 1970-01-01 01:00:00.000000000 +0100
72069 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/hw_irq.h    2006-04-10 00:05:52.000000000 +0200
72070 @@ -0,0 +1,141 @@
72071 +#ifndef _ASM_HW_IRQ_H
72072 +#define _ASM_HW_IRQ_H
72073 +
72074 +/*
72075 + *     linux/include/asm/hw_irq.h
72076 + *
72077 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
72078 + *
72079 + *     moved some of the old arch/i386/kernel/irq.h to here. VY
72080 + *
72081 + *     IRQ/IPI changes taken from work by Thomas Radke
72082 + *     <tomsoft@informatik.tu-chemnitz.de>
72083 + *
72084 + *     hacked by Andi Kleen for x86-64.
72085 + * 
72086 + *  $Id$
72087 + */
72088 +
72089 +#ifndef __ASSEMBLY__
72090 +#include <linux/config.h>
72091 +#include <asm/atomic.h>
72092 +#include <asm/irq.h>
72093 +#include <linux/profile.h>
72094 +#include <linux/smp.h>
72095 +
72096 +struct hw_interrupt_type;
72097 +#endif
72098 +
72099 +#define NMI_VECTOR             0x02
72100 +/*
72101 + * IDT vectors usable for external interrupt sources start
72102 + * at 0x20:
72103 + */
72104 +#define FIRST_EXTERNAL_VECTOR  0x20
72105 +
72106 +#define IA32_SYSCALL_VECTOR    0x80
72107 +
72108 +
72109 +/*
72110 + * Vectors 0x20-0x2f are used for ISA interrupts.
72111 + */
72112 +
72113 +/*
72114 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
72115 + *
72116 + *  some of the following vectors are 'rare', they are merged
72117 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
72118 + *  TLB, reschedule and local APIC vectors are performance-critical.
72119 + */
72120 +#ifndef CONFIG_XEN
72121 +#define SPURIOUS_APIC_VECTOR   0xff
72122 +#define ERROR_APIC_VECTOR      0xfe
72123 +#define RESCHEDULE_VECTOR      0xfd
72124 +#define CALL_FUNCTION_VECTOR   0xfc
72125 +/* fb free - please don't re-add KDB here because it's useless
72126 +   (hint - think what an NMI bit does to a vector) */
72127 +#define THERMAL_APIC_VECTOR    0xfa
72128 +#define THRESHOLD_APIC_VECTOR   0xf9
72129 +/* f8 free */
72130 +#define INVALIDATE_TLB_VECTOR_END      0xf7
72131 +#define INVALIDATE_TLB_VECTOR_START    0xf0    /* f0-f7 used for TLB flush */
72132 +
72133 +#define NUM_INVALIDATE_TLB_VECTORS     8
72134 +#endif
72135 +
72136 +/*
72137 + * Local APIC timer IRQ vector is on a different priority level,
72138 + * to work around the 'lost local interrupt if more than 2 IRQ
72139 + * sources per level' errata.
72140 + */
72141 +#define LOCAL_TIMER_VECTOR     0xef
72142 +
72143 +/*
72144 + * First APIC vector available to drivers: (vectors 0x30-0xee)
72145 + * we start at 0x31 to spread out vectors evenly between priority
72146 + * levels. (0x80 is the syscall vector)
72147 + */
72148 +#define FIRST_DEVICE_VECTOR    0x31
72149 +#define FIRST_SYSTEM_VECTOR    0xef   /* duplicated in irq.h */
72150 +
72151 +
72152 +#ifndef __ASSEMBLY__
72153 +extern u8 irq_vector[NR_IRQ_VECTORS];
72154 +#define IO_APIC_VECTOR(irq)    (irq_vector[irq])
72155 +#define AUTO_ASSIGN            -1
72156 +
72157 +/*
72158 + * Various low-level irq details needed by irq.c, process.c,
72159 + * time.c, io_apic.c and smp.c
72160 + *
72161 + * Interrupt entry/exit code at both C and assembly level
72162 + */
72163 +
72164 +extern void disable_8259A_irq(unsigned int irq);
72165 +extern void enable_8259A_irq(unsigned int irq);
72166 +extern int i8259A_irq_pending(unsigned int irq);
72167 +extern void make_8259A_irq(unsigned int irq);
72168 +extern void init_8259A(int aeoi);
72169 +extern void FASTCALL(send_IPI_self(int vector));
72170 +extern void init_VISWS_APIC_irqs(void);
72171 +extern void setup_IO_APIC(void);
72172 +extern void disable_IO_APIC(void);
72173 +extern void print_IO_APIC(void);
72174 +extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
72175 +extern void send_IPI(int dest, int vector);
72176 +extern void setup_ioapic_dest(void);
72177 +
72178 +extern unsigned long io_apic_irqs;
72179 +
72180 +extern atomic_t irq_err_count;
72181 +extern atomic_t irq_mis_count;
72182 +
72183 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
72184 +
72185 +#define __STR(x) #x
72186 +#define STR(x) __STR(x)
72187 +
72188 +#include <asm/ptrace.h>
72189 +
72190 +#define IRQ_NAME2(nr) nr##_interrupt(void)
72191 +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
72192 +
72193 +/*
72194 + *     SMP has a few special interrupts for IPI messages
72195 + */
72196 +
72197 +#define BUILD_IRQ(nr) \
72198 +asmlinkage void IRQ_NAME(nr); \
72199 +__asm__( \
72200 +"\n.p2align\n" \
72201 +"IRQ" #nr "_interrupt:\n\t" \
72202 +       "push $" #nr "-256 ; " \
72203 +       "jmp common_interrupt");
72204 +
72205 +extern void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i);
72206 +
72207 +#define platform_legacy_irq(irq)       ((irq) < 16)
72208 +
72209 +#endif
72210 +
72211 +#endif /* _ASM_HW_IRQ_H */
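[Editor's note -- illustrative expansion, derived from the macro definitions above] BUILD_IRQ pastes the vector number into both the C prototype and the assembly stub; the stub pushes (vector - 256) so the common entry path can distinguish hardware interrupts (negative values) from system vectors. For example, BUILD_IRQ(13) expands roughly to:

    asmlinkage void IRQ13_interrupt(void);
    __asm__(
    "\n.p2align\n"
    "IRQ13_interrupt:\n\t"
            "push $13-256 ; "         /* negative marks a hardware irq */
            "jmp common_interrupt");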
72212 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/hypercall.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/hypercall.h
72213 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/hypercall.h      1970-01-01 01:00:00.000000000 +0100
72214 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/hypercall.h 2006-04-10 00:05:52.000000000 +0200
72215 @@ -0,0 +1,343 @@
72216 +/******************************************************************************
72217 + * hypercall.h
72218 + * 
72219 + * Linux-specific hypervisor handling.
72220 + * 
72221 + * Copyright (c) 2002-2004, K A Fraser
72222 + * 
72223 + * 64-bit updates:
72224 + *   Benjamin Liu <benjamin.liu@intel.com>
72225 + *   Jun Nakajima <jun.nakajima@intel.com>
72226 + * 
72227 + * This program is free software; you can redistribute it and/or
72228 + * modify it under the terms of the GNU General Public License version 2
72229 + * as published by the Free Software Foundation; or, when distributed
72230 + * separately from the Linux kernel or incorporated into other
72231 + * software packages, subject to the following license:
72232 + * 
72233 + * Permission is hereby granted, free of charge, to any person obtaining a copy
72234 + * of this source file (the "Software"), to deal in the Software without
72235 + * restriction, including without limitation the rights to use, copy, modify,
72236 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
72237 + * and to permit persons to whom the Software is furnished to do so, subject to
72238 + * the following conditions:
72239 + * 
72240 + * The above copyright notice and this permission notice shall be included in
72241 + * all copies or substantial portions of the Software.
72242 + * 
72243 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
72244 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
72245 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
72246 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
72247 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
72248 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
72249 + * IN THE SOFTWARE.
72250 + */
72251 +
72252 +#ifndef __HYPERCALL_H__
72253 +#define __HYPERCALL_H__
72254 +
72255 +#ifndef __HYPERVISOR_H__
72256 +# error "please don't include this file directly"
72257 +#endif
72258 +
72259 +#define __STR(x) #x
72260 +#define STR(x) __STR(x)
72261 +
72262 +#define _hypercall0(type, name)                        \
72263 +({                                             \
72264 +       long __res;                             \
72265 +       asm volatile (                          \
72266 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72267 +               : "=a" (__res)                  \
72268 +               :                               \
72269 +               : "memory" );                   \
72270 +       (type)__res;                            \
72271 +})
72272 +
72273 +#define _hypercall1(type, name, a1)                            \
72274 +({                                                             \
72275 +       long __res, __ign1;                                     \
72276 +       asm volatile (                                          \
72277 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72278 +               : "=a" (__res), "=D" (__ign1)                   \
72279 +               : "1" ((long)(a1))                              \
72280 +               : "memory" );                                   \
72281 +       (type)__res;                                            \
72282 +})
72283 +
72284 +#define _hypercall2(type, name, a1, a2)                                \
72285 +({                                                             \
72286 +       long __res, __ign1, __ign2;                             \
72287 +       asm volatile (                                          \
72288 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72289 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2)    \
72290 +               : "1" ((long)(a1)), "2" ((long)(a2))            \
72291 +               : "memory" );                                   \
72292 +       (type)__res;                                            \
72293 +})
72294 +
72295 +#define _hypercall3(type, name, a1, a2, a3)                    \
72296 +({                                                             \
72297 +       long __res, __ign1, __ign2, __ign3;                     \
72298 +       asm volatile (                                          \
72299 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72300 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
72301 +               "=d" (__ign3)                                   \
72302 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
72303 +               "3" ((long)(a3))                                \
72304 +               : "memory" );                                   \
72305 +       (type)__res;                                            \
72306 +})
72307 +
72308 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
72309 +({                                                             \
72310 +       long __res, __ign1, __ign2, __ign3;                     \
72311 +       asm volatile (                                          \
72312 +               "movq %7,%%r10; "                               \
72313 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72314 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
72315 +               "=d" (__ign3)                                   \
72316 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
72317 +               "3" ((long)(a3)), "g" ((long)(a4))              \
72318 +               : "memory", "r10" );                            \
72319 +       (type)__res;                                            \
72320 +})
72321 +
72322 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
72323 +({                                                             \
72324 +       long __res, __ign1, __ign2, __ign3;                     \
72325 +       asm volatile (                                          \
72326 +               "movq %7,%%r10; movq %8,%%r8; "                 \
72327 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72328 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
72329 +               "=d" (__ign3)                                   \
72330 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
72331 +               "3" ((long)(a3)), "g" ((long)(a4)),             \
72332 +               "g" ((long)(a5))                                \
72333 +               : "memory", "r10", "r8" );                      \
72334 +       (type)__res;                                            \
72335 +})
72336 +
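[Editor's note -- sketch of the ABI these macros encode] On x86-64 Xen, hypercall arguments travel in %rdi, %rsi, %rdx, %r10 and %r8, the result comes back in %rax, and each hypercall is a CALL into a fixed 32-byte slot of the shared hypercall_page indexed by __HYPERVISOR_<name>. So _hypercall2(int, sched_op, cmd, arg) expands roughly to:

    long __res, __ign1, __ign2;
    asm volatile (
            "call hypercall_page + (__HYPERVISOR_sched_op * 32)"
            : "=a" (__res), "=D" (__ign1), "=S" (__ign2)   /* rax, rdi, rsi */
            : "1" ((long)cmd), "2" ((long)arg)
            : "memory");
    (int)__res;

The "=D"/"=S"/"=d" outputs exist only to tell gcc those registers are clobbered; the matching "1"/"2"/"3" inputs place the arguments there before the call.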
72337 +static inline int
72338 +HYPERVISOR_set_trap_table(
72339 +       trap_info_t *table)
72340 +{
72341 +       return _hypercall1(int, set_trap_table, table);
72342 +}
72343 +
72344 +static inline int
72345 +HYPERVISOR_mmu_update(
72346 +       mmu_update_t *req, int count, int *success_count, domid_t domid)
72347 +{
72348 +       return _hypercall4(int, mmu_update, req, count, success_count, domid);
72349 +}
72350 +
72351 +static inline int
72352 +HYPERVISOR_mmuext_op(
72353 +       struct mmuext_op *op, int count, int *success_count, domid_t domid)
72354 +{
72355 +       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
72356 +}
72357 +
72358 +static inline int
72359 +HYPERVISOR_set_gdt(
72360 +       unsigned long *frame_list, int entries)
72361 +{
72362 +       return _hypercall2(int, set_gdt, frame_list, entries);
72363 +}
72364 +
72365 +static inline int
72366 +HYPERVISOR_stack_switch(
72367 +       unsigned long ss, unsigned long esp)
72368 +{
72369 +       return _hypercall2(int, stack_switch, ss, esp);
72370 +}
72371 +
72372 +static inline int
72373 +HYPERVISOR_set_callbacks(
72374 +       unsigned long event_address, unsigned long failsafe_address, 
72375 +       unsigned long syscall_address)
72376 +{
72377 +       return _hypercall3(int, set_callbacks,
72378 +                          event_address, failsafe_address, syscall_address);
72379 +}
72380 +
72381 +static inline int
72382 +HYPERVISOR_fpu_taskswitch(
72383 +       int set)
72384 +{
72385 +       return _hypercall1(int, fpu_taskswitch, set);
72386 +}
72387 +
72388 +static inline int
72389 +HYPERVISOR_sched_op_compat(
72390 +       int cmd, unsigned long arg)
72391 +{
72392 +       return _hypercall2(int, sched_op_compat, cmd, arg);
72393 +}
72394 +
72395 +static inline int
72396 +HYPERVISOR_sched_op(
72397 +       int cmd, void *arg)
72398 +{
72399 +       return _hypercall2(int, sched_op, cmd, arg);
72400 +}
72401 +
72402 +static inline long
72403 +HYPERVISOR_set_timer_op(
72404 +       u64 timeout)
72405 +{
72406 +       return _hypercall1(long, set_timer_op, timeout);
72407 +}
72408 +
72409 +static inline int
72410 +HYPERVISOR_dom0_op(
72411 +       dom0_op_t *dom0_op)
72412 +{
72413 +       dom0_op->interface_version = DOM0_INTERFACE_VERSION;
72414 +       return _hypercall1(int, dom0_op, dom0_op);
72415 +}
72416 +
72417 +static inline int
72418 +HYPERVISOR_set_debugreg(
72419 +       int reg, unsigned long value)
72420 +{
72421 +       return _hypercall2(int, set_debugreg, reg, value);
72422 +}
72423 +
72424 +static inline unsigned long
72425 +HYPERVISOR_get_debugreg(
72426 +       int reg)
72427 +{
72428 +       return _hypercall1(unsigned long, get_debugreg, reg);
72429 +}
72430 +
72431 +static inline int
72432 +HYPERVISOR_update_descriptor(
72433 +       unsigned long ma, unsigned long word)
72434 +{
72435 +       return _hypercall2(int, update_descriptor, ma, word);
72436 +}
72437 +
72438 +static inline int
72439 +HYPERVISOR_memory_op(
72440 +       unsigned int cmd, void *arg)
72441 +{
72442 +       return _hypercall2(int, memory_op, cmd, arg);
72443 +}
72444 +
72445 +static inline int
72446 +HYPERVISOR_multicall(
72447 +       void *call_list, int nr_calls)
72448 +{
72449 +       return _hypercall2(int, multicall, call_list, nr_calls);
72450 +}
72451 +
72452 +static inline int
72453 +HYPERVISOR_update_va_mapping(
72454 +       unsigned long va, pte_t new_val, unsigned long flags)
72455 +{
72456 +       return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
72457 +}
72458 +
72459 +static inline int
72460 +HYPERVISOR_event_channel_op(
72461 +       void *op)
72462 +{
72463 +       return _hypercall1(int, event_channel_op, op);
72464 +}
72465 +
72466 +static inline int
72467 +HYPERVISOR_xen_version(
72468 +       int cmd, void *arg)
72469 +{
72470 +       return _hypercall2(int, xen_version, cmd, arg);
72471 +}
72472 +
72473 +static inline int
72474 +HYPERVISOR_console_io(
72475 +       int cmd, int count, char *str)
72476 +{
72477 +       return _hypercall3(int, console_io, cmd, count, str);
72478 +}
72479 +
72480 +static inline int
72481 +HYPERVISOR_physdev_op(
72482 +       void *physdev_op)
72483 +{
72484 +       return _hypercall1(int, physdev_op, physdev_op);
72485 +}
72486 +
72487 +static inline int
72488 +HYPERVISOR_grant_table_op(
72489 +       unsigned int cmd, void *uop, unsigned int count)
72490 +{
72491 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
72492 +}
72493 +
72494 +static inline int
72495 +HYPERVISOR_update_va_mapping_otherdomain(
72496 +       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
72497 +{
72498 +       return _hypercall4(int, update_va_mapping_otherdomain, va,
72499 +                          new_val.pte, flags, domid);
72500 +}
72501 +
72502 +static inline int
72503 +HYPERVISOR_vm_assist(
72504 +       unsigned int cmd, unsigned int type)
72505 +{
72506 +       return _hypercall2(int, vm_assist, cmd, type);
72507 +}
72508 +
72509 +static inline int
72510 +HYPERVISOR_vcpu_op(
72511 +       int cmd, int vcpuid, void *extra_args)
72512 +{
72513 +       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
72514 +}
72515 +
72516 +static inline int
72517 +HYPERVISOR_set_segment_base(
72518 +       int reg, unsigned long value)
72519 +{
72520 +       return _hypercall2(int, set_segment_base, reg, value);
72521 +}
72522 +
72523 +static inline int
72524 +HYPERVISOR_suspend(
72525 +       unsigned long srec)
72526 +{
72527 +       struct sched_shutdown sched_shutdown = {
72528 +               .reason = SHUTDOWN_suspend
72529 +       };
72530 +
72531 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
72532 +                            &sched_shutdown, srec);
72533 +
72534 +       if (rc == -ENOSYS)
72535 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
72536 +                                SHUTDOWN_suspend, srec);
72537 +
72538 +       return rc;
72539 +}
72540 +
72541 +static inline int
72542 +HYPERVISOR_nmi_op(
72543 +       unsigned long op, void *arg)
72544 +{
72545 +       return _hypercall2(int, nmi_op, op, arg);
72546 +}
72547 +
72548 +#endif /* __HYPERCALL_H__ */
72549 +
72550 +/*
72551 + * Local variables:
72552 + *  c-file-style: "linux"
72553 + *  indent-tabs-mode: t
72554 + *  c-indent-level: 8
72555 + *  c-basic-offset: 8
72556 + *  tab-width: 8
72557 + * End:
72558 + */
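[Editor's note -- hypothetical usage sketch] The HYPERVISOR_suspend wrapper above shows the compatibility idiom used when an interface changed shape between Xen releases: try the structured sched_op call first and fall back to the flat pre-3.0 variant on -ENOSYS. The same pattern fits any shutdown reason (SHUTDOWN_poweroff is an assumed example value from the Xen interface headers):

    struct sched_shutdown arg = { .reason = SHUTDOWN_poweroff };
    int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &arg);
    if (rc == -ENOSYS)
            rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, SHUTDOWN_poweroff);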
72559 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/hypervisor.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/hypervisor.h
72560 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/hypervisor.h     1970-01-01 01:00:00.000000000 +0100
72561 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/hypervisor.h        2006-04-10 00:05:52.000000000 +0200
72562 @@ -0,0 +1,2 @@
72563 +
72564 +#include <asm-i386/mach-xen/asm/hypervisor.h>
72565 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/io.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/io.h
72566 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/io.h     1970-01-01 01:00:00.000000000 +0100
72567 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/io.h        2006-04-10 00:05:52.000000000 +0200
72568 @@ -0,0 +1,359 @@
72569 +#ifndef _ASM_IO_H
72570 +#define _ASM_IO_H
72571 +
72572 +#include <linux/config.h>
72573 +#include <asm/fixmap.h>
72574 +
72575 +/*
72576 + * This file contains the definitions for the x86 IO instructions
72577 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
72578 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
72579 + * versions of the single-IO instructions (inb_p/inw_p/..).
72580 + *
72581 + * This file is not meant to be obfuscating: it's just complicated
72582 + * to (a) handle it all in a way that makes gcc able to optimize it
72583 + * as well as possible and (b) trying to avoid writing the same thing
72584 + * over and over again with slight variations and possibly making a
72585 + * mistake somewhere.
72586 + */
72587 +
72588 +/*
72589 + * Thanks to James van Artsdalen for a better timing-fix than
72590 + * the two short jumps: using outb's to a nonexistent port seems
72591 + * to guarantee better timings even on fast machines.
72592 + *
72593 + * On the other hand, I'd like to be sure of a non-existent port:
72594 + * I feel a bit unsafe about using 0x80 (should be safe, though)
72595 + *
72596 + *             Linus
72597 + */
72598 +
72599 + /*
72600 +  *  Bit simplified and optimized by Jan Hubicka
72601 +  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
72602 +  *
72603 +  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
72604 +  *  isa_read[wl] and isa_write[wl] fixed
72605 +  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
72606 +  */
72607 +
72608 +#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
72609 +
72610 +#ifdef REALLY_SLOW_IO
72611 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
72612 +#else
72613 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
72614 +#endif
72615 +
72616 +/*
72617 + * Talk about misusing macros..
72618 + */
72619 +#define __OUT1(s,x) \
72620 +static inline void out##s(unsigned x value, unsigned short port) {
72621 +
72622 +#define __OUT2(s,s1,s2) \
72623 +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
72624 +
72625 +#define __OUT(s,s1,x) \
72626 +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
72627 +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
72628 +
72629 +#define __IN1(s) \
72630 +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
72631 +
72632 +#define __IN2(s,s1,s2) \
72633 +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
72634 +
72635 +#define __IN(s,s1,i...) \
72636 +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
72637 +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
72638 +
72639 +#define __INS(s) \
72640 +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
72641 +{ __asm__ __volatile__ ("rep ; ins" #s \
72642 +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
72643 +
72644 +#define __OUTS(s) \
72645 +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
72646 +{ __asm__ __volatile__ ("rep ; outs" #s \
72647 +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
72648 +
72649 +#define RETURN_TYPE unsigned char
72650 +__IN(b,"")
72651 +#undef RETURN_TYPE
72652 +#define RETURN_TYPE unsigned short
72653 +__IN(w,"")
72654 +#undef RETURN_TYPE
72655 +#define RETURN_TYPE unsigned int
72656 +__IN(l,"")
72657 +#undef RETURN_TYPE
72658 +
72659 +__OUT(b,"b",char)
72660 +__OUT(w,"w",short)
72661 +__OUT(l,,int)
72662 +
72663 +__INS(b)
72664 +__INS(w)
72665 +__INS(l)
72666 +
72667 +__OUTS(b)
72668 +__OUTS(w)
72669 +__OUTS(l)
72670 +
72671 +#define IO_SPACE_LIMIT 0xffff
72672 +
72673 +#if defined(__KERNEL__) && __x86_64__
72674 +
72675 +#include <linux/vmalloc.h>
72676 +
72677 +#ifndef __i386__
72678 +/*
72679 + * Change virtual addresses to physical addresses and vv.
72680 + * These are pretty trivial
72681 + */
72682 +static inline unsigned long virt_to_phys(volatile void * address)
72683 +{
72684 +       return __pa(address);
72685 +}
72686 +
72687 +static inline void * phys_to_virt(unsigned long address)
72688 +{
72689 +       return __va(address);
72690 +}
72691 +
72692 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
72693 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
72694 +#endif
72695 +
72696 +/*
72697 + * Change "struct page" to physical address.
72698 + */
72699 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
72700 +#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
72701 +
72702 +#define bio_to_pseudophys(bio)  (page_to_pseudophys(bio_page((bio))) + \
72703 +                                 (unsigned long) bio_offset((bio)))
72704 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
72705 +                                 (unsigned long) (bv)->bv_offset)
72706 +
72707 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)      \
72708 +       (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
72709 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
72710 +         bvec_to_pseudophys((vec2))))
72711 +
72712 +#include <asm-generic/iomap.h>
72713 +
72714 +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
72715 +
72716 +static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
72717 +{
72718 +       return __ioremap(offset, size, 0);
72719 +}
72720 +
72721 +/*
72722 + * This one maps high address device memory and turns off caching for that area;
72723 + * it's useful if some control registers are in such an area and write combining
72724 + * or read caching is not desirable:
72725 + */
72726 +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
72727 +extern void iounmap(volatile void __iomem *addr);
72728 +
72729 +/* Use normal IO mappings for DMI */
72730 +#define dmi_ioremap ioremap
72731 +#define dmi_iounmap(x,l) iounmap(x)
72732 +#define dmi_alloc(l) kmalloc(l, GFP_ATOMIC)
72733 +
72734 +/*
72735 + * ISA I/O bus memory addresses are 1:1 with the physical address.
72736 + */
72737 +
72738 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
72739 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
72740 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
72741 +
72742 +/*
72743 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
72744 + * are forbidden in portable PCI drivers.
72745 + *
72746 + * Allow them on x86 for legacy drivers, though.
72747 + */
72748 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
72749 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
72750 +
72751 +/*
72752 + * readX/writeX() are used to access memory mapped devices. On some
72753 + * architectures the memory mapped IO stuff needs to be accessed
72754 + * differently. On the x86 architecture, we just read/write the
72755 + * memory location directly.
72756 + */
72757 +
72758 +static inline __u8 __readb(const volatile void __iomem *addr)
72759 +{
72760 +       return *(__force volatile __u8 *)addr;
72761 +}
72762 +static inline __u16 __readw(const volatile void __iomem *addr)
72763 +{
72764 +       return *(__force volatile __u16 *)addr;
72765 +}
72766 +static inline __u32 __readl(const volatile void __iomem *addr)
72767 +{
72768 +       return *(__force volatile __u32 *)addr;
72769 +}
72770 +static inline __u64 __readq(const volatile void __iomem *addr)
72771 +{
72772 +       return *(__force volatile __u64 *)addr;
72773 +}
72774 +#define readb(x) __readb(x)
72775 +#define readw(x) __readw(x)
72776 +#define readl(x) __readl(x)
72777 +#define readq(x) __readq(x)
72778 +#define readb_relaxed(a) readb(a)
72779 +#define readw_relaxed(a) readw(a)
72780 +#define readl_relaxed(a) readl(a)
72781 +#define readq_relaxed(a) readq(a)
72782 +#define __raw_readb readb
72783 +#define __raw_readw readw
72784 +#define __raw_readl readl
72785 +#define __raw_readq readq
72786 +
72787 +#define mmiowb()
72788 +
72789 +#ifdef CONFIG_UNORDERED_IO
72790 +static inline void __writel(__u32 val, volatile void __iomem *addr)
72791 +{
72792 +       volatile __u32 __iomem *target = addr;
72793 +       asm volatile("movnti %1,%0"
72794 +                    : "=m" (*target)
72795 +                    : "r" (val) : "memory");
72796 +}
72797 +
72798 +static inline void __writeq(__u64 val, volatile void __iomem *addr)
72799 +{
72800 +       volatile __u64 __iomem *target = addr;
72801 +       asm volatile("movnti %1,%0"
72802 +                    : "=m" (*target)
72803 +                    : "r" (val) : "memory");
72804 +}
72805 +#else
72806 +static inline void __writel(__u32 b, volatile void __iomem *addr)
72807 +{
72808 +       *(__force volatile __u32 *)addr = b;
72809 +}
72810 +static inline void __writeq(__u64 b, volatile void __iomem *addr)
72811 +{
72812 +       *(__force volatile __u64 *)addr = b;
72813 +}
72814 +#endif
72815 +static inline void __writeb(__u8 b, volatile void __iomem *addr)
72816 +{
72817 +       *(__force volatile __u8 *)addr = b;
72818 +}
72819 +static inline void __writew(__u16 b, volatile void __iomem *addr)
72820 +{
72821 +       *(__force volatile __u16 *)addr = b;
72822 +}
72823 +#define writeq(val,addr) __writeq((val),(addr))
72824 +#define writel(val,addr) __writel((val),(addr))
72825 +#define writew(val,addr) __writew((val),(addr))
72826 +#define writeb(val,addr) __writeb((val),(addr))
72827 +#define __raw_writeb writeb
72828 +#define __raw_writew writew
72829 +#define __raw_writel writel
72830 +#define __raw_writeq writeq
72831 +
72832 +void __memcpy_fromio(void*,unsigned long,unsigned);
72833 +void __memcpy_toio(unsigned long,const void*,unsigned);
72834 +
72835 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
72836 +{
72837 +       __memcpy_fromio(to,(unsigned long)from,len);
72838 +}
72839 +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
72840 +{
72841 +       __memcpy_toio((unsigned long)to,from,len);
72842 +}
72843 +
72844 +void memset_io(volatile void __iomem *a, int b, size_t c);
72845 +
72846 +/*
72847 + * ISA space is 'always mapped' on a typical x86 system, no need to
72848 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
72849 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
72850 + * are physical addresses. The following constant pointer can be
72851 + * used as the IO-area pointer (it can be iounmapped as well, so the
72852 + * analogy with PCI is quite large):
72853 + */
72854 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
72855 +
72856 +#define isa_readb(a) readb(__ISA_IO_base + (a))
72857 +#define isa_readw(a) readw(__ISA_IO_base + (a))
72858 +#define isa_readl(a) readl(__ISA_IO_base + (a))
72859 +#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a))
72860 +#define isa_writew(w,a) writew(w,__ISA_IO_base + (a))
72861 +#define isa_writel(l,a) writel(l,__ISA_IO_base + (a))
72862 +#define isa_memset_io(a,b,c)           memset_io(__ISA_IO_base + (a),(b),(c))
72863 +#define isa_memcpy_fromio(a,b,c)       memcpy_fromio((a),__ISA_IO_base + (b),(c))
72864 +#define isa_memcpy_toio(a,b,c)         memcpy_toio(__ISA_IO_base + (a),(b),(c))
72865 +
72866 +
72867 +/*
72868 + * Again, x86-64 does not require memory-mapped-I/O-specific functions.
72869 + */
72870 +
72871 +#define eth_io_copy_and_sum(a,b,c,d)           eth_copy_and_sum((a),(void *)(b),(c),(d))
72872 +#define isa_eth_io_copy_and_sum(a,b,c,d)       eth_copy_and_sum((a),(void *)(__ISA_IO_base + (b)),(c),(d))
72873 +
72874 +/**
72875 + *     check_signature         -       find BIOS signatures
72876 + *     @io_addr: mmio address to check 
72877 + *     @signature:  signature block
72878 + *     @length: length of signature
72879 + *
72880 + *     Perform a signature comparison with the mmio address io_addr. This
72881 + *     address should have been obtained by ioremap.
72882 + *     Returns 1 on a match.
72883 + */
72884 +
72885 +static inline int check_signature(void __iomem *io_addr,
72886 +       const unsigned char *signature, int length)
72887 +{
72888 +       int retval = 0;
72889 +       do {
72890 +               if (readb(io_addr) != *signature)
72891 +                       goto out;
72892 +               io_addr++;
72893 +               signature++;
72894 +               length--;
72895 +       } while (length);
72896 +       retval = 1;
72897 +out:
72898 +       return retval;
72899 +}
72900 +
72901 +/* Nothing to do */
72902 +
72903 +#define dma_cache_inv(_start,_size)            do { } while (0)
72904 +#define dma_cache_wback(_start,_size)          do { } while (0)
72905 +#define dma_cache_wback_inv(_start,_size)      do { } while (0)
72906 +
72907 +#define flush_write_buffers() 
72908 +
72909 +extern int iommu_bio_merge;
72910 +#define BIO_VMERGE_BOUNDARY iommu_bio_merge
72911 +
72912 +/*
72913 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
72914 + * access
72915 + */
72916 +#define xlate_dev_mem_ptr(p)   __va(p)
72917 +
72918 +/*
72919 + * Convert a virtual cached pointer to an uncached pointer
72920 + */
72921 +#define xlate_dev_kmem_ptr(p)  p
72922 +
72923 +#endif /* __KERNEL__ */
72924 +
72925 +#define ARCH_HAS_DEV_MEM
72926 +
72927 +#endif
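[Editor's note -- illustrative expansion] The __IN/__OUT macro machinery above generates the whole inb/inw/inl and outb/outw/outl family. With RETURN_TYPE set to unsigned char, __IN(b,"") expands roughly to:

    static inline unsigned char inb(unsigned short port)
    {
            unsigned char _v;
            __asm__ __volatile__ ("inb %w1,%0" : "=a" (_v) : "Nd" (port));
            return _v;
    }

plus a pausing twin inb_p() whose asm template is followed by __FULL_SLOW_DOWN_IO, i.e. one (or, with REALLY_SLOW_IO, four) dummy outb's to port 0x80.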
72928 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/irq.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/irq.h
72929 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/irq.h    1970-01-01 01:00:00.000000000 +0100
72930 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/irq.h       2006-04-10 00:05:52.000000000 +0200
72931 @@ -0,0 +1,39 @@
72932 +#ifndef _ASM_IRQ_H
72933 +#define _ASM_IRQ_H
72934 +
72935 +/*
72936 + *     linux/include/asm/irq.h
72937 + *
72938 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
72939 + *
72940 + *     IRQ/IPI changes taken from work by Thomas Radke
72941 + *     <tomsoft@informatik.tu-chemnitz.de>
72942 + */
72943 +
72944 +#include <linux/config.h>
72945 +#include <linux/sched.h>
72946 +/* include comes from machine specific directory */
72947 +#include "irq_vectors.h"
72948 +#include <asm/thread_info.h>
72949 +
72950 +static __inline__ int irq_canonicalize(int irq)
72951 +{
72952 +       return ((irq == 2) ? 9 : irq);
72953 +}
72954 +
72955 +#ifdef CONFIG_X86_LOCAL_APIC
72956 +#define ARCH_HAS_NMI_WATCHDOG          /* See include/linux/nmi.h */
72957 +#endif
72958 +
72959 +#define KDB_VECTOR     0xf9
72960 +
72961 +# define irq_ctx_init(cpu) do { } while (0)
72962 +
72963 +#ifdef CONFIG_HOTPLUG_CPU
72964 +#include <linux/cpumask.h>
72965 +extern void fixup_irqs(cpumask_t map);
72966 +#endif
72967 +
72968 +#define __ARCH_HAS_DO_SOFTIRQ 1
72969 +
72970 +#endif /* _ASM_IRQ_H */
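[Editor's note] irq_canonicalize() exists because IRQ 2 is the slave-PIC cascade line: a device wired to "IRQ 2" actually raises IRQ 9. A driver would typically normalize before requesting the line, e.g.:

    int irq = irq_canonicalize(dev_irq);   /* 2 -> 9, everything else unchanged */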
72971 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/mmu_context.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/mmu_context.h
72972 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/mmu_context.h    1970-01-01 01:00:00.000000000 +0100
72973 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/mmu_context.h       2006-04-10 00:05:52.000000000 +0200
72974 @@ -0,0 +1,134 @@
72975 +#ifndef __X86_64_MMU_CONTEXT_H
72976 +#define __X86_64_MMU_CONTEXT_H
72977 +
72978 +#include <linux/config.h>
72979 +#include <asm/desc.h>
72980 +#include <asm/atomic.h>
72981 +#include <asm/pgalloc.h>
72982 +#include <asm/page.h>
72983 +#include <asm/pda.h>
72984 +#include <asm/pgtable.h>
72985 +#include <asm/tlbflush.h>
72986 +
72987 +/*
72988 + * possibly do the LDT unload here?
72989 + */
72990 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
72991 +void destroy_context(struct mm_struct *mm);
72992 +
72993 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
72994 +{
72995 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
72996 +       if (read_pda(mmu_state) == TLBSTATE_OK) 
72997 +               write_pda(mmu_state, TLBSTATE_LAZY);
72998 +#endif
72999 +}
73000 +
73001 +#define prepare_arch_switch(next)      __prepare_arch_switch()
73002 +
73003 +static inline void __prepare_arch_switch(void)
73004 +{
73005 +       /*
73006 +        * Save away %es, %ds, %fs and %gs. Must happen before reload
73007 +        * of cr3/ldt (i.e., not in __switch_to).
73008 +        */
73009 +       __asm__ __volatile__ (
73010 +               "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
73011 +               : "=m" (current->thread.es),
73012 +                 "=m" (current->thread.ds),
73013 +                 "=m" (current->thread.fsindex),
73014 +                 "=m" (current->thread.gsindex) );
73015 +
73016 +       if (current->thread.ds)
73017 +               __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
73018 +
73019 +       if (current->thread.es)
73020 +               __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
73021 +
73022 +       if (current->thread.fsindex) {
73023 +               __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
73024 +               current->thread.fs = 0;
73025 +       }
73026 +
73027 +       if (current->thread.gsindex) {
73028 +               load_gs_index(0);
73029 +               current->thread.gs = 0;
73030 +       }
73031 +}
73032 +
73033 +extern void mm_pin(struct mm_struct *mm);
73034 +extern void mm_unpin(struct mm_struct *mm);
73035 +void mm_pin_all(void);
73036 +
73037 +static inline void load_cr3(pgd_t *pgd)
73038 +{
73039 +       asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
73040 +                    "memory");
73041 +}
73042 +
73043 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
73044 +                            struct task_struct *tsk)
73045 +{
73046 +       unsigned cpu = smp_processor_id();
73047 +       struct mmuext_op _op[3], *op = _op;
73048 +
73049 +       if (likely(prev != next)) {
73050 +               if (!next->context.pinned)
73051 +                       mm_pin(next);
73052 +
73053 +               /* stop flush ipis for the previous mm */
73054 +               clear_bit(cpu, &prev->cpu_vm_mask);
73055 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
73056 +               write_pda(mmu_state, TLBSTATE_OK);
73057 +               write_pda(active_mm, next);
73058 +#endif
73059 +               set_bit(cpu, &next->cpu_vm_mask);
73060 +
73061 +               /* load_cr3(next->pgd) */
73062 +               op->cmd = MMUEXT_NEW_BASEPTR;
73063 +               op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
73064 +               op++;
73065 +
73066 +               /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
73067 +               op->cmd = MMUEXT_NEW_USER_BASEPTR;
73068 +               op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
73069 +               op++;
73070 +               
73071 +               if (unlikely(next->context.ldt != prev->context.ldt)) {
73072 +                       /* load_LDT_nolock(&next->context, cpu) */
73073 +                       op->cmd = MMUEXT_SET_LDT;
73074 +                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
73075 +                       op->arg2.nr_ents     = next->context.size;
73076 +                       op++;
73077 +               }
73078 +
73079 +               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
73080 +       }
73081 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
73082 +       else {
73083 +               write_pda(mmu_state, TLBSTATE_OK);
73084 +               if (read_pda(active_mm) != next)
73085 +                       out_of_line_bug();
73086 +               if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) {
73087 +                       /* We were in lazy tlb mode and leave_mm disabled 
73088 +                        * tlb flush IPI delivery. We must reload CR3
73089 +                        * to make sure we don't use freed page tables.
73090 +                        */
73091 +                        load_cr3(next->pgd);
73092 +                        xen_new_user_pt(__pa(__user_pgd(next->pgd)));          
73093 +                       load_LDT_nolock(&next->context, cpu);
73094 +               }
73095 +       }
73096 +#endif
73097 +}
73098 +
73099 +#define deactivate_mm(tsk,mm)  do { \
73100 +       load_gs_index(0); \
73101 +       asm volatile("movl %0,%%fs"::"r"(0));  \
73102 +} while(0)
73103 +
73104 +#define activate_mm(prev, next) do {           \
73105 +       switch_mm((prev),(next),NULL);          \
73106 +} while (0)
73107 +
73108 +#endif
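[Editor's note -- sketch of the batching idiom] Because a paravirtualized guest cannot write %cr3 directly, switch_mm above gathers up to three mmuext requests (new kernel base pointer, new user base pointer, optional LDT switch) and submits them with a single HYPERVISOR_mmuext_op trap instead of three. Stripped to its core, assuming the MFNs kernel_pgd_mfn and user_pgd_mfn are already known:

    struct mmuext_op ops[2], *op = ops;

    op->cmd = MMUEXT_NEW_BASEPTR;           /* load kernel page-table base */
    op->arg1.mfn = kernel_pgd_mfn;
    op++;
    op->cmd = MMUEXT_NEW_USER_BASEPTR;      /* load user page-table base */
    op->arg1.mfn = user_pgd_mfn;
    op++;
    BUG_ON(HYPERVISOR_mmuext_op(ops, op - ops, NULL, DOMID_SELF));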
73109 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/mmu.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/mmu.h
73110 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/mmu.h    1970-01-01 01:00:00.000000000 +0100
73111 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/mmu.h       2006-04-10 00:05:52.000000000 +0200
73112 @@ -0,0 +1,33 @@
73113 +#ifndef __x86_64_MMU_H
73114 +#define __x86_64_MMU_H
73115 +
73116 +#include <linux/spinlock.h>
73117 +#include <asm/semaphore.h>
73118 +
73119 +/*
73120 + * The x86_64 doesn't have an mmu context, but
73121 + * we put the segment information here.
73122 + *
73123 + * cpu_vm_mask is used to optimize ldt flushing.
73124 + */
73125 +typedef struct { 
73126 +       void *ldt;
73127 +       rwlock_t ldtlock; 
73128 +       int size;
73129 +       struct semaphore sem; 
73130 +#ifdef CONFIG_XEN
73131 +       unsigned pinned:1;
73132 +       struct list_head unpinned;
73133 +#endif
73134 +} mm_context_t;
73135 +
73136 +#ifdef CONFIG_XEN
73137 +extern struct list_head mm_unpinned;
73138 +extern spinlock_t mm_unpinned_lock;
73139 +
73140 +/* mm/memory.c:exit_mmap hook */
73141 +extern void _arch_exit_mmap(struct mm_struct *mm);
73142 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
73143 +#endif
73144 +
73145 +#endif
73146 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/msr.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/msr.h
73147 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/msr.h    1970-01-01 01:00:00.000000000 +0100
73148 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/msr.h       2006-04-10 00:05:52.000000000 +0200
73149 @@ -0,0 +1,399 @@
73150 +#ifndef X86_64_MSR_H
73151 +#define X86_64_MSR_H 1
73152 +
73153 +#ifndef __ASSEMBLY__
73154 +/*
73155 + * Access to machine-specific registers (available on 586 and better only)
73156 + * Note: the rd* operations modify the parameters directly (without using
73157 + * pointer indirection); this allows gcc to optimize better
73158 + */
73159 +
73160 +#define rdmsr(msr,val1,val2) \
73161 +       __asm__ __volatile__("rdmsr" \
73162 +                           : "=a" (val1), "=d" (val2) \
73163 +                           : "c" (msr))
73164 +
73165 +
73166 +#define rdmsrl(msr,val) do { unsigned long a__,b__; \
73167 +       __asm__ __volatile__("rdmsr" \
73168 +                           : "=a" (a__), "=d" (b__) \
73169 +                           : "c" (msr)); \
73170 +       val = a__ | (b__<<32); \
73171 +} while(0)
73172 +
73173 +#define wrmsr(msr,val1,val2) \
73174 +     __asm__ __volatile__("wrmsr" \
73175 +                         : /* no outputs */ \
73176 +                         : "c" (msr), "a" (val1), "d" (val2))
73177 +
73178 +#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32) 
73179 +
73180 +/* wrmsr with exception handling */
73181 +#define wrmsr_safe(msr,a,b) ({ int ret__;                      \
73182 +       asm volatile("2: wrmsr ; xorl %0,%0\n"                  \
73183 +                    "1:\n\t"                                   \
73184 +                    ".section .fixup,\"ax\"\n\t"               \
73185 +                    "3:  movl %4,%0 ; jmp 1b\n\t"              \
73186 +                    ".previous\n\t"                            \
73187 +                    ".section __ex_table,\"a\"\n"              \
73188 +                    "   .align 8\n\t"                          \
73189 +                    "   .quad  2b,3b\n\t"                      \
73190 +                    ".previous"                                \
73191 +                    : "=a" (ret__)                             \
73192 +                    : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
73193 +       ret__; })
73194 +
73195 +#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
73196 +
73197 +#define rdmsr_safe(msr,a,b) \
73198 +       ({ int ret__;                                           \
73199 +         asm volatile ("1:       rdmsr\n"                      \
73200 +                      "2:\n"                                   \
73201 +                      ".section .fixup,\"ax\"\n"               \
73202 +                      "3:       movl %4,%0\n"                  \
73203 +                      " jmp 2b\n"                              \
73204 +                      ".previous\n"                            \
73205 +                      ".section __ex_table,\"a\"\n"            \
73206 +                      " .align 8\n"                            \
73207 +                      " .quad 1b,3b\n"                         \
73208 +                      ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b))\
73209 +                      :"c"(msr), "i"(-EIO), "0"(0));           \
73210 +         ret__; })             
73211 +
73212 +#define rdtsc(low,high) \
73213 +     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
73214 +
73215 +#define rdtscl(low) \
73216 +     __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
73217 +
73218 +#define rdtscll(val) do { \
73219 +     unsigned int __a,__d; \
73220 +     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
73221 +     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
73222 +} while(0)
73223 +
73224 +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
73225 +
73226 +#define rdpmc(counter,low,high) \
73227 +     __asm__ __volatile__("rdpmc" \
73228 +                         : "=a" (low), "=d" (high) \
73229 +                         : "c" (counter))
73230 +
73231 +static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
73232 +                        unsigned int *ecx, unsigned int *edx)
73233 +{
73234 +       __asm__(XEN_CPUID
73235 +               : "=a" (*eax),
73236 +                 "=b" (*ebx),
73237 +                 "=c" (*ecx),
73238 +                 "=d" (*edx)
73239 +               : "0" (op));
73240 +}
73241 +
73242 +/* Some CPUID calls want 'count' to be placed in ecx */
73243 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
73244 +               int *edx)
73245 +{
73246 +       __asm__(XEN_CPUID
73247 +               : "=a" (*eax),
73248 +                 "=b" (*ebx),
73249 +                 "=c" (*ecx),
73250 +                 "=d" (*edx)
73251 +               : "0" (op), "c" (count));
73252 +}
73253 +
73254 +/*
73255 + * CPUID functions returning a single datum
73256 + */
73257 +static inline unsigned int cpuid_eax(unsigned int op)
73258 +{
73259 +       unsigned int eax;
73260 +
73261 +       __asm__(XEN_CPUID
73262 +               : "=a" (eax)
73263 +               : "0" (op)
73264 +               : "bx", "cx", "dx");
73265 +       return eax;
73266 +}
73267 +static inline unsigned int cpuid_ebx(unsigned int op)
73268 +{
73269 +       unsigned int eax, ebx;
73270 +
73271 +       __asm__(XEN_CPUID
73272 +               : "=a" (eax), "=b" (ebx)
73273 +               : "0" (op)
73274 +               : "cx", "dx" );
73275 +       return ebx;
73276 +}
73277 +static inline unsigned int cpuid_ecx(unsigned int op)
73278 +{
73279 +       unsigned int eax, ecx;
73280 +
73281 +       __asm__(XEN_CPUID
73282 +               : "=a" (eax), "=c" (ecx)
73283 +               : "0" (op)
73284 +               : "bx", "dx" );
73285 +       return ecx;
73286 +}
73287 +static inline unsigned int cpuid_edx(unsigned int op)
73288 +{
73289 +       unsigned int eax, edx;
73290 +
73291 +       __asm__(XEN_CPUID
73292 +               : "=a" (eax), "=d" (edx)
73293 +               : "0" (op)
73294 +               : "bx", "cx");
73295 +       return edx;
73296 +}
73297 +
73298 +#define MSR_IA32_UCODE_WRITE           0x79
73299 +#define MSR_IA32_UCODE_REV             0x8b
73300 +
73301 +
73302 +#endif
73303 +
73304 +/* AMD/K8 specific MSRs */ 
73305 +#define MSR_EFER 0xc0000080            /* extended feature register */
73306 +#define MSR_STAR 0xc0000081            /* legacy mode SYSCALL target */
73307 +#define MSR_LSTAR 0xc0000082           /* long mode SYSCALL target */
73308 +#define MSR_CSTAR 0xc0000083           /* compatibility mode SYSCALL target */
73309 +#define MSR_SYSCALL_MASK 0xc0000084    /* EFLAGS mask for syscall */
73310 +#define MSR_FS_BASE 0xc0000100         /* 64bit FS base */
73311 +#define MSR_GS_BASE 0xc0000101         /* 64bit GS base */
73312 +#define MSR_KERNEL_GS_BASE  0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */ 
73313 +/* EFER bits: */ 
73314 +#define _EFER_SCE 0  /* SYSCALL/SYSRET */
73315 +#define _EFER_LME 8  /* Long mode enable */
73316 +#define _EFER_LMA 10 /* Long mode active (read-only) */
73317 +#define _EFER_NX 11  /* No execute enable */
73318 +
73319 +#define EFER_SCE (1<<_EFER_SCE)
73320 +#define EFER_LME (1<<_EFER_LME)
73321 +#define EFER_LMA (1<<_EFER_LMA)
73322 +#define EFER_NX (1<<_EFER_NX)
73323 +
73324 +/* Intel MSRs. Some also available on other CPUs */
73325 +#define MSR_IA32_TSC           0x10
73326 +#define MSR_IA32_PLATFORM_ID   0x17
73327 +
73328 +#define MSR_IA32_PERFCTR0      0xc1
73329 +#define MSR_IA32_PERFCTR1      0xc2
73330 +
73331 +#define MSR_MTRRcap            0x0fe
73332 +#define MSR_IA32_BBL_CR_CTL        0x119
73333 +
73334 +#define MSR_IA32_SYSENTER_CS   0x174
73335 +#define MSR_IA32_SYSENTER_ESP  0x175
73336 +#define MSR_IA32_SYSENTER_EIP  0x176
73337 +
73338 +#define MSR_IA32_MCG_CAP       0x179
73339 +#define MSR_IA32_MCG_STATUS        0x17a
73340 +#define MSR_IA32_MCG_CTL       0x17b
73341 +
73342 +#define MSR_IA32_EVNTSEL0      0x186
73343 +#define MSR_IA32_EVNTSEL1      0x187
73344 +
73345 +#define MSR_IA32_DEBUGCTLMSR       0x1d9
73346 +#define MSR_IA32_LASTBRANCHFROMIP  0x1db
73347 +#define MSR_IA32_LASTBRANCHTOIP        0x1dc
73348 +#define MSR_IA32_LASTINTFROMIP     0x1dd
73349 +#define MSR_IA32_LASTINTTOIP       0x1de
73350 +
73351 +#define MSR_MTRRfix64K_00000   0x250
73352 +#define MSR_MTRRfix16K_80000   0x258
73353 +#define MSR_MTRRfix16K_A0000   0x259
73354 +#define MSR_MTRRfix4K_C0000    0x268
73355 +#define MSR_MTRRfix4K_C8000    0x269
73356 +#define MSR_MTRRfix4K_D0000    0x26a
73357 +#define MSR_MTRRfix4K_D8000    0x26b
73358 +#define MSR_MTRRfix4K_E0000    0x26c
73359 +#define MSR_MTRRfix4K_E8000    0x26d
73360 +#define MSR_MTRRfix4K_F0000    0x26e
73361 +#define MSR_MTRRfix4K_F8000    0x26f
73362 +#define MSR_MTRRdefType                0x2ff
73363 +
73364 +#define MSR_IA32_MC0_CTL       0x400
73365 +#define MSR_IA32_MC0_STATUS        0x401
73366 +#define MSR_IA32_MC0_ADDR      0x402
73367 +#define MSR_IA32_MC0_MISC      0x403
73368 +
73369 +#define MSR_P6_PERFCTR0                        0xc1
73370 +#define MSR_P6_PERFCTR1                        0xc2
73371 +#define MSR_P6_EVNTSEL0                        0x186
73372 +#define MSR_P6_EVNTSEL1                        0x187
73373 +
73374 +/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */
73375 +#define MSR_K7_EVNTSEL0            0xC0010000
73376 +#define MSR_K7_PERFCTR0            0xC0010004
73377 +#define MSR_K7_EVNTSEL1            0xC0010001
73378 +#define MSR_K7_PERFCTR1            0xC0010005
73379 +#define MSR_K7_EVNTSEL2            0xC0010002
73380 +#define MSR_K7_PERFCTR2            0xC0010006
73381 +#define MSR_K7_EVNTSEL3            0xC0010003
73382 +#define MSR_K7_PERFCTR3            0xC0010007
73383 +#define MSR_K8_TOP_MEM1                   0xC001001A
73384 +#define MSR_K8_TOP_MEM2                   0xC001001D
73385 +#define MSR_K8_SYSCFG             0xC0010010
73386 +#define MSR_K8_HWCR               0xC0010015
73387 +
73388 +/* K6 MSRs */
73389 +#define MSR_K6_EFER                    0xC0000080
73390 +#define MSR_K6_STAR                    0xC0000081
73391 +#define MSR_K6_WHCR                    0xC0000082
73392 +#define MSR_K6_UWCCR                   0xC0000085
73393 +#define MSR_K6_PSOR                    0xC0000087
73394 +#define MSR_K6_PFIR                    0xC0000088
73395 +
73396 +/* Centaur-Hauls/IDT defined MSRs. */
73397 +#define MSR_IDT_FCR1                   0x107
73398 +#define MSR_IDT_FCR2                   0x108
73399 +#define MSR_IDT_FCR3                   0x109
73400 +#define MSR_IDT_FCR4                   0x10a
73401 +
73402 +#define MSR_IDT_MCR0                   0x110
73403 +#define MSR_IDT_MCR1                   0x111
73404 +#define MSR_IDT_MCR2                   0x112
73405 +#define MSR_IDT_MCR3                   0x113
73406 +#define MSR_IDT_MCR4                   0x114
73407 +#define MSR_IDT_MCR5                   0x115
73408 +#define MSR_IDT_MCR6                   0x116
73409 +#define MSR_IDT_MCR7                   0x117
73410 +#define MSR_IDT_MCR_CTRL               0x120
73411 +
73412 +/* VIA Cyrix defined MSRs*/
73413 +#define MSR_VIA_FCR                    0x1107
73414 +#define MSR_VIA_LONGHAUL               0x110a
73415 +#define MSR_VIA_RNG                    0x110b
73416 +#define MSR_VIA_BCR2                   0x1147
73417 +
73418 +/* Intel defined MSRs. */
73419 +#define MSR_IA32_P5_MC_ADDR            0
73420 +#define MSR_IA32_P5_MC_TYPE            1
73421 +#define MSR_IA32_PLATFORM_ID           0x17
73422 +#define MSR_IA32_EBL_CR_POWERON                0x2a
73423 +
73424 +#define MSR_IA32_APICBASE               0x1b
73425 +#define MSR_IA32_APICBASE_BSP           (1<<8)
73426 +#define MSR_IA32_APICBASE_ENABLE        (1<<11)
73427 +#define MSR_IA32_APICBASE_BASE          (0xfffff<<12)
73428 +
73429 +/* P4/Xeon+ specific */
73430 +#define MSR_IA32_MCG_EAX               0x180
73431 +#define MSR_IA32_MCG_EBX               0x181
73432 +#define MSR_IA32_MCG_ECX               0x182
73433 +#define MSR_IA32_MCG_EDX               0x183
73434 +#define MSR_IA32_MCG_ESI               0x184
73435 +#define MSR_IA32_MCG_EDI               0x185
73436 +#define MSR_IA32_MCG_EBP               0x186
73437 +#define MSR_IA32_MCG_ESP               0x187
73438 +#define MSR_IA32_MCG_EFLAGS            0x188
73439 +#define MSR_IA32_MCG_EIP               0x189
73440 +#define MSR_IA32_MCG_RESERVED          0x18A
73441 +
73442 +#define MSR_P6_EVNTSEL0                        0x186
73443 +#define MSR_P6_EVNTSEL1                        0x187
73444 +
73445 +#define MSR_IA32_PERF_STATUS           0x198
73446 +#define MSR_IA32_PERF_CTL              0x199
73447 +
73448 +#define MSR_IA32_THERM_CONTROL         0x19a
73449 +#define MSR_IA32_THERM_INTERRUPT       0x19b
73450 +#define MSR_IA32_THERM_STATUS          0x19c
73451 +#define MSR_IA32_MISC_ENABLE           0x1a0
73452 +
73453 +#define MSR_IA32_DEBUGCTLMSR           0x1d9
73454 +#define MSR_IA32_LASTBRANCHFROMIP      0x1db
73455 +#define MSR_IA32_LASTBRANCHTOIP                0x1dc
73456 +#define MSR_IA32_LASTINTFROMIP         0x1dd
73457 +#define MSR_IA32_LASTINTTOIP           0x1de
73458 +
73459 +#define MSR_IA32_MC0_CTL               0x400
73460 +#define MSR_IA32_MC0_STATUS            0x401
73461 +#define MSR_IA32_MC0_ADDR              0x402
73462 +#define MSR_IA32_MC0_MISC              0x403
73463 +
73464 +/* Pentium IV performance counter MSRs */
73465 +#define MSR_P4_BPU_PERFCTR0            0x300
73466 +#define MSR_P4_BPU_PERFCTR1            0x301
73467 +#define MSR_P4_BPU_PERFCTR2            0x302
73468 +#define MSR_P4_BPU_PERFCTR3            0x303
73469 +#define MSR_P4_MS_PERFCTR0             0x304
73470 +#define MSR_P4_MS_PERFCTR1             0x305
73471 +#define MSR_P4_MS_PERFCTR2             0x306
73472 +#define MSR_P4_MS_PERFCTR3             0x307
73473 +#define MSR_P4_FLAME_PERFCTR0          0x308
73474 +#define MSR_P4_FLAME_PERFCTR1          0x309
73475 +#define MSR_P4_FLAME_PERFCTR2          0x30a
73476 +#define MSR_P4_FLAME_PERFCTR3          0x30b
73477 +#define MSR_P4_IQ_PERFCTR0             0x30c
73478 +#define MSR_P4_IQ_PERFCTR1             0x30d
73479 +#define MSR_P4_IQ_PERFCTR2             0x30e
73480 +#define MSR_P4_IQ_PERFCTR3             0x30f
73481 +#define MSR_P4_IQ_PERFCTR4             0x310
73482 +#define MSR_P4_IQ_PERFCTR5             0x311
73483 +#define MSR_P4_BPU_CCCR0               0x360
73484 +#define MSR_P4_BPU_CCCR1               0x361
73485 +#define MSR_P4_BPU_CCCR2               0x362
73486 +#define MSR_P4_BPU_CCCR3               0x363
73487 +#define MSR_P4_MS_CCCR0                0x364
73488 +#define MSR_P4_MS_CCCR1                0x365
73489 +#define MSR_P4_MS_CCCR2                0x366
73490 +#define MSR_P4_MS_CCCR3                0x367
73491 +#define MSR_P4_FLAME_CCCR0             0x368
73492 +#define MSR_P4_FLAME_CCCR1             0x369
73493 +#define MSR_P4_FLAME_CCCR2             0x36a
73494 +#define MSR_P4_FLAME_CCCR3             0x36b
73495 +#define MSR_P4_IQ_CCCR0                0x36c
73496 +#define MSR_P4_IQ_CCCR1                0x36d
73497 +#define MSR_P4_IQ_CCCR2                0x36e
73498 +#define MSR_P4_IQ_CCCR3                0x36f
73499 +#define MSR_P4_IQ_CCCR4                0x370
73500 +#define MSR_P4_IQ_CCCR5                0x371
73501 +#define MSR_P4_ALF_ESCR0               0x3ca
73502 +#define MSR_P4_ALF_ESCR1               0x3cb
73503 +#define MSR_P4_BPU_ESCR0               0x3b2
73504 +#define MSR_P4_BPU_ESCR1               0x3b3
73505 +#define MSR_P4_BSU_ESCR0               0x3a0
73506 +#define MSR_P4_BSU_ESCR1               0x3a1
73507 +#define MSR_P4_CRU_ESCR0               0x3b8
73508 +#define MSR_P4_CRU_ESCR1               0x3b9
73509 +#define MSR_P4_CRU_ESCR2               0x3cc
73510 +#define MSR_P4_CRU_ESCR3               0x3cd
73511 +#define MSR_P4_CRU_ESCR4               0x3e0
73512 +#define MSR_P4_CRU_ESCR5               0x3e1
73513 +#define MSR_P4_DAC_ESCR0               0x3a8
73514 +#define MSR_P4_DAC_ESCR1               0x3a9
73515 +#define MSR_P4_FIRM_ESCR0              0x3a4
73516 +#define MSR_P4_FIRM_ESCR1              0x3a5
73517 +#define MSR_P4_FLAME_ESCR0             0x3a6
73518 +#define MSR_P4_FLAME_ESCR1             0x3a7
73519 +#define MSR_P4_FSB_ESCR0               0x3a2
73520 +#define MSR_P4_FSB_ESCR1               0x3a3
73521 +#define MSR_P4_IQ_ESCR0                0x3ba
73522 +#define MSR_P4_IQ_ESCR1                0x3bb
73523 +#define MSR_P4_IS_ESCR0                0x3b4
73524 +#define MSR_P4_IS_ESCR1                0x3b5
73525 +#define MSR_P4_ITLB_ESCR0              0x3b6
73526 +#define MSR_P4_ITLB_ESCR1              0x3b7
73527 +#define MSR_P4_IX_ESCR0                0x3c8
73528 +#define MSR_P4_IX_ESCR1                0x3c9
73529 +#define MSR_P4_MOB_ESCR0               0x3aa
73530 +#define MSR_P4_MOB_ESCR1               0x3ab
73531 +#define MSR_P4_MS_ESCR0                0x3c0
73532 +#define MSR_P4_MS_ESCR1                0x3c1
73533 +#define MSR_P4_PMH_ESCR0               0x3ac
73534 +#define MSR_P4_PMH_ESCR1               0x3ad
73535 +#define MSR_P4_RAT_ESCR0               0x3bc
73536 +#define MSR_P4_RAT_ESCR1               0x3bd
73537 +#define MSR_P4_SAAT_ESCR0              0x3ae
73538 +#define MSR_P4_SAAT_ESCR1              0x3af
73539 +#define MSR_P4_SSU_ESCR0               0x3be
73540 +#define MSR_P4_SSU_ESCR1               0x3bf    /* guess: not defined in manual */
73541 +#define MSR_P4_TBPU_ESCR0              0x3c2
73542 +#define MSR_P4_TBPU_ESCR1              0x3c3
73543 +#define MSR_P4_TC_ESCR0                0x3c4
73544 +#define MSR_P4_TC_ESCR1                0x3c5
73545 +#define MSR_P4_U2L_ESCR0               0x3b0
73546 +#define MSR_P4_U2L_ESCR1               0x3b1
73547 +
73548 +#endif
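/*
 * Example (illustrative sketch, not part of the patch): the P6
 * event-select MSRs defined above are programmed in pairs with their
 * counters.  This assumes the standard wrmsr() helper from <asm/msr.h>
 * and MSR_P6_PERFCTR0 (0xc1) from earlier in this header; the event
 * code 0x79 (CPU_CLK_UNHALTED) and the USR/OS/EN bits (16/17/22)
 * follow the Intel manuals and are for illustration only.
 */
static void enable_cycle_counter(void)
{
	unsigned int evntsel = 0x79 | (1 << 16) | (1 << 17) | (1 << 22);

	wrmsr(MSR_P6_PERFCTR0, 0, 0);		/* clear the counter  */
	wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);	/* program and enable */
}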
73549 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/nmi.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/nmi.h
73550 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/nmi.h    1970-01-01 01:00:00.000000000 +0100
73551 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/nmi.h       2006-04-10 00:05:52.000000000 +0200
73552 @@ -0,0 +1,75 @@
73553 +/*
73554 + *  linux/include/asm-i386/nmi.h
73555 + */
73556 +#ifndef ASM_NMI_H
73557 +#define ASM_NMI_H
73558 +
73559 +#include <linux/pm.h>
73560 +
73561 +#include <xen/interface/nmi.h>
73562 +
73563 +struct pt_regs;
73564 +
73565 +typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
73566 +
73567 +/** 
73568 + * set_nmi_callback
73569 + *
73570 + * Set a handler for an NMI. Only one handler may be
73571 + * set; the handler should return 1 if it handled the NMI.
73572 + */
73573 +void set_nmi_callback(nmi_callback_t callback);
73574 +
73575 +/** 
73576 + * unset_nmi_callback
73577 + *
73578 + * Remove the handler previously set.
73579 + */
73580 +void unset_nmi_callback(void);
73581 +
73582 +#ifdef CONFIG_PM
73583 +
73584 +/** Replace the PM callback routine for NMI. */
73585 +struct pm_dev * set_nmi_pm_callback(pm_callback callback);
73586 +
73587 +/** Unset the PM callback routine back to the default. */
73588 +void unset_nmi_pm_callback(struct pm_dev * dev);
73589 +
73590 +#else
73591 +
73592 +static inline struct pm_dev * set_nmi_pm_callback(pm_callback callback)
73593 +{
73594 +       return 0;
73595 +} 
73596 +
73597 +static inline void unset_nmi_pm_callback(struct pm_dev * dev)
73598 +{
73599 +}
73600 +
73601 +#endif /* CONFIG_PM */
73602 +
73603 +extern void default_do_nmi(struct pt_regs *);
73604 +extern void die_nmi(char *str, struct pt_regs *regs);
73605 +
73606 +static inline unsigned char get_nmi_reason(void)
73607 +{
73608 +        shared_info_t *s = HYPERVISOR_shared_info;
73609 +        unsigned char reason = 0;
73610 +
73611 +        /* construct a value which looks like it came from
73612 +         * port 0x61.
73613 +         */
73614 +        if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
73615 +                reason |= 0x40;
73616 +        if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
73617 +                reason |= 0x80;
73618 +
73619 +        return reason;
73620 +}
73621 +
73622 +extern int panic_on_timeout;
73623 +extern int unknown_nmi_panic;
73624 +
73625 +extern int check_nmi_watchdog(void);
73626 +
73627 +#endif /* ASM_NMI_H */
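/*
 * Example (illustrative sketch): registering an NMI callback with the
 * interface above.  get_nmi_reason() synthesizes a port-0x61 style
 * reason byte from the shared-info bits, so handlers can keep testing
 * 0x40 (I/O check) and 0x80 (parity) exactly as on native hardware.
 * my_nmi_handler is a hypothetical name.
 */
static int my_nmi_handler(struct pt_regs *regs, int cpu)
{
	unsigned char reason = get_nmi_reason();

	if (reason & 0xc0) {
		printk(KERN_ERR "NMI: reason %02x on CPU %d\n", reason, cpu);
		return 1;	/* handled */
	}
	return 0;		/* let default_do_nmi() deal with it */
}
/* Registered once, e.g. at init time: set_nmi_callback(my_nmi_handler); */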
73628 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/page.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/page.h
73629 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/page.h   1970-01-01 01:00:00.000000000 +0100
73630 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/page.h      2006-04-10 00:05:52.000000000 +0200
73631 @@ -0,0 +1,318 @@
73632 +#ifndef _X86_64_PAGE_H
73633 +#define _X86_64_PAGE_H
73634 +
73635 +#include <linux/config.h>
73636 +/* #include <linux/string.h> */
73637 +#ifndef __ASSEMBLY__
73638 +#include <linux/kernel.h>
73639 +#include <linux/types.h>
73640 +#include <asm/bug.h>
73641 +#include <xen/features.h>
73642 +#endif
73643 +#include <xen/interface/xen.h> 
73644 +#include <xen/foreign_page.h>
73645 +
73646 +#define arch_free_page(_page,_order)                   \
73647 +({     int foreign = PageForeign(_page);               \
73648 +       if (foreign)                                    \
73649 +               (PageForeignDestructor(_page))(_page);  \
73650 +       foreign;                                        \
73651 +})
73652 +#define HAVE_ARCH_FREE_PAGE
73653 +
73654 +#ifdef CONFIG_XEN_SCRUB_PAGES
73655 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
73656 +#else
73657 +#define scrub_pages(_p,_n) ((void)0)
73658 +#endif
73659 +
73660 +/* PAGE_SHIFT determines the page size */
73661 +#define PAGE_SHIFT     12
73662 +#ifdef __ASSEMBLY__
73663 +#define PAGE_SIZE      (0x1 << PAGE_SHIFT)
73664 +#else
73665 +#define PAGE_SIZE      (1UL << PAGE_SHIFT)
73666 +#endif
73667 +#define PAGE_MASK      (~(PAGE_SIZE-1))
73668 +#define PHYSICAL_PAGE_MASK     (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
73669 +
73670 +#define THREAD_ORDER 1 
73671 +#define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
73672 +#define CURRENT_MASK (~(THREAD_SIZE-1))
73673 +
73674 +#define EXCEPTION_STACK_ORDER 0
73675 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
73676 +
73677 +#define DEBUG_STACK_ORDER EXCEPTION_STACK_ORDER
73678 +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
73679 +
73680 +#define IRQSTACK_ORDER 2
73681 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
73682 +
73683 +#define STACKFAULT_STACK 1
73684 +#define DOUBLEFAULT_STACK 2
73685 +#define NMI_STACK 3
73686 +#define DEBUG_STACK 4
73687 +#define MCE_STACK 5
73688 +#define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
73689 +
73690 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
73691 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
73692 +
73693 +#define HPAGE_SHIFT PMD_SHIFT
73694 +#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
73695 +#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
73696 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
73697 +
73698 +#ifdef __KERNEL__
73699 +#ifndef __ASSEMBLY__
73700 +
73701 +extern unsigned long end_pfn;
73702 +
73703 +void clear_page(void *);
73704 +void copy_page(void *, void *);
73705 +
73706 +#define clear_user_page(page, vaddr, pg)       clear_page(page)
73707 +#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
73708 +
73709 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
73710 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
73711 +
73712 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
73713 +#define INVALID_P2M_ENTRY      (~0UL)
73714 +#define FOREIGN_FRAME_BIT      (1UL<<63)
73715 +#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
73716 +
73717 +extern unsigned long *phys_to_machine_mapping;
73718 +
73719 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
73720 +{
73721 +       if (xen_feature(XENFEAT_auto_translated_physmap))
73722 +               return pfn;
73723 +       return phys_to_machine_mapping[(unsigned int)(pfn)] &
73724 +               ~FOREIGN_FRAME_BIT;
73725 +}
73726 +
73727 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
73728 +{
73729 +       if (xen_feature(XENFEAT_auto_translated_physmap))
73730 +               return 1;
73731 +       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
73732 +}
73733 +
73734 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
73735 +{
73736 +       unsigned long pfn;
73737 +
73738 +       if (xen_feature(XENFEAT_auto_translated_physmap))
73739 +               return mfn;
73740 +
73741 +       /*
73742 +        * The array access can fail (e.g., device space beyond end of RAM).
73743 +        * In such cases it doesn't matter what we return (we return garbage),
73744 +        * but we must handle the fault without crashing!
73745 +        */
73746 +       asm (
73747 +               "1:     movq %1,%0\n"
73748 +               "2:\n"
73749 +               ".section __ex_table,\"a\"\n"
73750 +               "       .align 8\n"
73751 +               "       .quad 1b,2b\n"
73752 +               ".previous"
73753 +               : "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) );
73754 +
73755 +       return pfn;
73756 +}
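/*
 * Note (illustrative): the inline __ex_table entry above pairs the
 * possibly-faulting load at label 1 with the continuation at label 2,
 * so a fault on machine_to_phys_mapping[mfn] resumes execution and
 * returns whatever happens to be in 'pfn' -- the same fixup mechanism
 * used by get_user()/put_user().
 */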
73757 +
73758 +/*
73759 + * We detect special mappings in one of two ways:
73760 + *  1. If the MFN is an I/O page then Xen will set the m2p entry
73761 + *     to be outside our maximum possible pseudophys range.
73762 + *  2. If the MFN belongs to a different domain then we will certainly
73763 + *     not have MFN in our p2m table. Conversely, if the page is ours,
73764 + *     then we'll have p2m(m2p(MFN))==MFN.
73765 + * If we detect a special mapping then it doesn't have a 'struct page'.
73766 + * We force !pfn_valid() by returning an out-of-range pointer.
73767 + *
73768 + * NB. These checks require that, for any MFN that is not in our reservation,
73769 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
73770 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
73771 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
73772 + *
73773 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
73774 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
73775 + *      require. In all the cases we care about, the FOREIGN_FRAME bit is
73776 + *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
73777 + */
73778 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
73779 +{
73780 +       unsigned long pfn = mfn_to_pfn(mfn);
73781 +       if ((pfn < end_pfn)
73782 +           && !xen_feature(XENFEAT_auto_translated_physmap)
73783 +           && (phys_to_machine_mapping[pfn] != mfn))
73784 +               return end_pfn; /* force !pfn_valid() */
73785 +       return pfn;
73786 +}
73787 +
73788 +
73789 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
73790 +{
73791 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
73792 +               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
73793 +               return;
73794 +       }
73795 +       phys_to_machine_mapping[pfn] = mfn;
73796 +}
73797 +
73798 +/* Definitions for machine and pseudophysical addresses. */
73799 +typedef unsigned long paddr_t;
73800 +typedef unsigned long maddr_t;
73801 +
73802 +static inline maddr_t phys_to_machine(paddr_t phys)
73803 +{
73804 +       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
73805 +       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
73806 +       return machine;
73807 +}
73808 +
73809 +static inline paddr_t machine_to_phys(maddr_t machine)
73810 +{
73811 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
73812 +       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
73813 +       return phys;
73814 +}
73815 +
73816 +/*
73817 + * These are used to make use of C type-checking..
73818 + */
73819 +typedef struct { unsigned long pte; } pte_t;
73820 +typedef struct { unsigned long pmd; } pmd_t;
73821 +typedef struct { unsigned long pud; } pud_t;
73822 +typedef struct { unsigned long pgd; } pgd_t;
73823 +#define PTE_MASK       PHYSICAL_PAGE_MASK
73824 +
73825 +typedef struct { unsigned long pgprot; } pgprot_t;
73826 +
73827 +#define pte_val(x)     (((x).pte & 1) ? machine_to_phys((x).pte) : \
73828 +                        (x).pte)
73829 +#define pte_val_ma(x)  ((x).pte)
73830 +
73831 +static inline unsigned long pmd_val(pmd_t x)
73832 +{
73833 +       unsigned long ret = x.pmd;
73834 +       if (ret) ret = machine_to_phys(ret);
73835 +       return ret;
73836 +}
73837 +
73838 +static inline unsigned long pud_val(pud_t x)
73839 +{
73840 +       unsigned long ret = x.pud;
73841 +       if (ret) ret = machine_to_phys(ret);
73842 +       return ret;
73843 +}
73844 +
73845 +static inline unsigned long pgd_val(pgd_t x)
73846 +{
73847 +       unsigned long ret = x.pgd;
73848 +       if (ret) ret = machine_to_phys(ret);
73849 +       return ret;
73850 +}
73851 +
73852 +#define pgprot_val(x)  ((x).pgprot)
73853 +
73854 +#define __pte_ma(x)     ((pte_t) { (x) } )
73855 +
73856 +static inline pte_t __pte(unsigned long x)
73857 +{
73858 +       if (x & 1) x = phys_to_machine(x);
73859 +       return ((pte_t) { (x) });
73860 +}
73861 +
73862 +static inline pmd_t __pmd(unsigned long x)
73863 +{
73864 +       if ((x & 1)) x = phys_to_machine(x);
73865 +       return ((pmd_t) { (x) });
73866 +}
73867 +
73868 +static inline pud_t __pud(unsigned long x)
73869 +{
73870 +       if ((x & 1)) x = phys_to_machine(x);
73871 +       return ((pud_t) { (x) });
73872 +}
73873 +
73874 +static inline pgd_t __pgd(unsigned long x)
73875 +{
73876 +       if ((x & 1)) x = phys_to_machine(x);
73877 +       return ((pgd_t) { (x) });
73878 +}
73879 +
73880 +#define __pgprot(x)    ((pgprot_t) { (x) } )
73881 +
73882 +#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
73883 +#define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
73884 +#define __START_KERNEL_map     0xffffffff80000000UL
73885 +#define __PAGE_OFFSET           0xffff880000000000UL   
73886 +
73887 +#else
73888 +#define __PHYSICAL_START       CONFIG_PHYSICAL_START
73889 +#define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
73890 +#define __START_KERNEL_map     0xffffffff80000000
73891 +#define __PAGE_OFFSET           0xffff880000000000
73892 +#endif /* !__ASSEMBLY__ */
73893 +
73894 +#undef LOAD_OFFSET
73895 +#define LOAD_OFFSET            0
73896 +
73897 +/* to align the pointer to the (next) page boundary */
73898 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
73899 +
73900 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
73901 +#define __PHYSICAL_MASK_SHIFT  46
73902 +#define __PHYSICAL_MASK                ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
73903 +#define __VIRTUAL_MASK_SHIFT   48
73904 +#define __VIRTUAL_MASK         ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
73905 +
73906 +#define KERNEL_TEXT_SIZE  (40UL*1024*1024)
73907 +#define KERNEL_TEXT_START 0xffffffff80000000UL 
73908 +
73909 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
73910 +
73911 +/* Note: __pa(&symbol_visible_to_c) should always be replaced with __pa_symbol.
73912 +   Otherwise you risk miscompilation. */ 
73913 +#define __pa(x)                        (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
73914 +/* __pa_symbol should be used for C visible symbols.
73915 +   This seems to be the official gcc blessed way to do such arithmetic. */ 
73916 +#define __pa_symbol(x)         \
73917 +       ({unsigned long v;  \
73918 +         asm("" : "=r" (v) : "0" (x)); \
73919 +         __pa(v); })
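/*
 * Example (illustrative): why the empty asm matters.  It hides the
 * symbol's origin from gcc so the address-range test in __pa() cannot
 * be constant-folded away; 'marker' is a hypothetical symbol.
 *
 *	static char marker[16];
 *	unsigned long pa  = __pa_symbol(marker);   (safe)
 *	unsigned long bad = __pa(&marker);         (risks miscompilation)
 */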
73920 +
73921 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
73922 +#define __boot_va(x)           __va(x)
73923 +#define __boot_pa(x)           __pa(x)
73924 +#ifdef CONFIG_FLATMEM
73925 +#define pfn_to_page(pfn)       (mem_map + (pfn))
73926 +#define page_to_pfn(page)      ((unsigned long)((page) - mem_map))
73927 +#define pfn_valid(pfn)         ((pfn) < end_pfn)
73928 +#endif
73929 +
73930 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
73931 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
73932 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
73933 +
73934 +/* VIRT <-> MACHINE conversion */
73935 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
73936 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
73937 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
73938 +
73939 +#define VM_DATA_DEFAULT_FLAGS \
73940 +       (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
73941 +        VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
73942 +
73943 +#define __HAVE_ARCH_GATE_AREA 1        
73944 +
73945 +#endif /* __KERNEL__ */
73946 +
73947 +#include <asm-generic/page.h>
73948 +
73949 +#endif /* _X86_64_PAGE_H */
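/*
 * Example (illustrative sketch): how the pseudophysical<->machine
 * helpers above compose.  virt_to_machine() goes va -> pa -> ma and
 * machine_to_phys() inverts it via the machine_to_phys_mapping table,
 * so for any directly-mapped kernel address the round trip is the
 * identity (also under XENFEAT_auto_translated_physmap, where both
 * directions are no-ops).
 */
static void check_translation(void *va)
{
	maddr_t ma = virt_to_machine(va);	/* va -> pseudophys -> machine */
	paddr_t pa = machine_to_phys(ma);	/* machine -> pseudophys       */

	BUG_ON(pa != __pa(va));
}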
73950 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/pci.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/pci.h
73951 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/pci.h    1970-01-01 01:00:00.000000000 +0100
73952 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/pci.h       2006-04-10 00:05:52.000000000 +0200
73953 @@ -0,0 +1,174 @@
73954 +#ifndef __x8664_PCI_H
73955 +#define __x8664_PCI_H
73956 +
73957 +#include <linux/config.h>
73958 +#include <asm/io.h>
73959 +
73960 +#ifdef __KERNEL__
73961 +
73962 +#include <linux/mm.h> /* for struct page */
73963 +
73964 +/* Can be used to override the logic in pci_scan_bus for skipping
73965 +   already-configured bus numbers - to be used for buggy BIOSes
73966 +   or architectures with incomplete PCI setup by the loader */
73967 +
73968 +#ifdef CONFIG_PCI
73969 +extern unsigned int pcibios_assign_all_busses(void);
73970 +#else
73971 +#define pcibios_assign_all_busses()    0
73972 +#endif
73973 +#define pcibios_scan_all_fns(a, b)     0
73974 +
73975 +extern unsigned long pci_mem_start;
73976 +#define PCIBIOS_MIN_IO         0x1000
73977 +#define PCIBIOS_MIN_MEM                (pci_mem_start)
73978 +
73979 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
73980 +
73981 +void pcibios_config_init(void);
73982 +struct pci_bus * pcibios_scan_root(int bus);
73983 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
73984 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
73985 +
73986 +void pcibios_set_master(struct pci_dev *dev);
73987 +void pcibios_penalize_isa_irq(int irq, int active);
73988 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
73989 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
73990 +
73991 +#include <linux/types.h>
73992 +#include <linux/slab.h>
73993 +#include <asm/scatterlist.h>
73994 +#include <linux/string.h>
73995 +#include <asm/page.h>
73996 +#include <linux/dma-mapping.h> /* for have_iommu */
73997 +
73998 +extern int iommu_setup(char *opt);
73999 +
74000 +/* The PCI address space does equal the physical memory
74001 + * address space.  The networking and block device layers use
74002 + * this boolean for bounce buffer decisions.
74003 + *
74004 + * On AMD64 the assumption mostly holds, but we set it to zero if a hardware
74005 + * IOMMU (gart) or software IOMMU (swiotlb) is available.
74006 + */
74007 +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
74008 +
74009 +#ifdef CONFIG_GART_IOMMU
74010 +
74011 +/*
74012 + * x86-64 always supports DAC, but sometimes it is useful to force
74013 + * devices through the IOMMU to get automatic sg list merging.
74014 + * Optional right now.
74015 + */
74016 +extern int iommu_sac_force;
74017 +#define pci_dac_dma_supported(pci_dev, mask)   (!iommu_sac_force)
74018 +
74019 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
74020 +       dma_addr_t ADDR_NAME;
74021 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
74022 +       __u32 LEN_NAME;
74023 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
74024 +       ((PTR)->ADDR_NAME)
74025 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
74026 +       (((PTR)->ADDR_NAME) = (VAL))
74027 +#define pci_unmap_len(PTR, LEN_NAME)                   \
74028 +       ((PTR)->LEN_NAME)
74029 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
74030 +       (((PTR)->LEN_NAME) = (VAL))
74031 +
74032 +#elif defined(CONFIG_SWIOTLB)
74033 +
74034 +#define pci_dac_dma_supported(pci_dev, mask)    1
74035 +
74036 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
74037 +       dma_addr_t ADDR_NAME;
74038 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
74039 +       __u32 LEN_NAME;
74040 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
74041 +       ((PTR)->ADDR_NAME)
74042 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
74043 +       (((PTR)->ADDR_NAME) = (VAL))
74044 +#define pci_unmap_len(PTR, LEN_NAME)                   \
74045 +       ((PTR)->LEN_NAME)
74046 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
74047 +       (((PTR)->LEN_NAME) = (VAL))
74048 +
74049 +#else
74050 +/* No IOMMU */
74051 +
74052 +#define pci_dac_dma_supported(pci_dev, mask)    1
74053 +
74054 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
74055 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
74056 +#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
74057 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
74058 +#define pci_unmap_len(PTR, LEN_NAME)           (0)
74059 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
74060 +
74061 +#endif
74062 +
74063 +#include <asm-generic/pci-dma-compat.h>
74064 +
74065 +static inline dma64_addr_t
74066 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
74067 +{
74068 +       return ((dma64_addr_t) page_to_phys(page) +
74069 +               (dma64_addr_t) offset);
74070 +}
74071 +
74072 +static inline struct page *
74073 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
74074 +{
74075 +       return virt_to_page(__va(dma_addr));    
74076 +}
74077 +
74078 +static inline unsigned long
74079 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
74080 +{
74081 +       return (dma_addr & ~PAGE_MASK);
74082 +}
74083 +
74084 +static inline void
74085 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
74086 +{
74087 +}
74088 +
74089 +static inline void
74090 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
74091 +{
74092 +       flush_write_buffers();
74093 +}
74094 +
74095 +#ifdef CONFIG_PCI
74096 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
74097 +                                       enum pci_dma_burst_strategy *strat,
74098 +                                       unsigned long *strategy_parameter)
74099 +{
74100 +       *strat = PCI_DMA_BURST_INFINITY;
74101 +       *strategy_parameter = ~0UL;
74102 +}
74103 +#endif
74104 +
74105 +#define HAVE_PCI_MMAP
74106 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
74107 +                              enum pci_mmap_state mmap_state, int write_combine);
74108 +
74109 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
74110 +{
74111 +}
74112 +
74113 +#endif /* __KERNEL__ */
74114 +
74115 +/* generic pci stuff */
74116 +#ifdef CONFIG_PCI
74117 +#include <asm-generic/pci.h>
74118 +#endif
74119 +
74120 +/* On Xen we have to scan all functions since Xen hides bridges from
74121 + * us.  If a bridge is at fn=0 and that slot has a multifunction
74122 + * device, we won't find the additional devices without scanning all
74123 + * functions. */
74124 +#undef pcibios_scan_all_fns
74125 +#define pcibios_scan_all_fns(a, b)     1
74126 +
74127 +#endif /* __x8664_PCI_H */
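/*
 * Example (illustrative sketch): the DECLARE_PCI_UNMAP_* macros above
 * let a driver carry DMA unmap state only in configurations where an
 * IOMMU actually needs it; in the no-IOMMU build they compile away
 * entirely.  struct my_ring and my_unmap() are hypothetical names.
 */
struct my_ring {
	void *buf;
	DECLARE_PCI_UNMAP_ADDR(mapping)
	DECLARE_PCI_UNMAP_LEN(len)
};

static void my_unmap(struct pci_dev *pdev, struct my_ring *r)
{
	pci_unmap_single(pdev, pci_unmap_addr(r, mapping),
			 pci_unmap_len(r, len), PCI_DMA_FROMDEVICE);
}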
74128 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/pgalloc.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/pgalloc.h
74129 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/pgalloc.h        1970-01-01 01:00:00.000000000 +0100
74130 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/pgalloc.h   2006-04-10 00:05:52.000000000 +0200
74131 @@ -0,0 +1,198 @@
74132 +#ifndef _X86_64_PGALLOC_H
74133 +#define _X86_64_PGALLOC_H
74134 +
74135 +#include <asm/fixmap.h>
74136 +#include <asm/pda.h>
74137 +#include <linux/threads.h>
74138 +#include <linux/mm.h>
74139 +#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
74140 +
74141 +#include <xen/features.h>
74142 +void make_page_readonly(void *va, unsigned int feature);
74143 +void make_page_writable(void *va, unsigned int feature);
74144 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
74145 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
74146 +
74147 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
74148 +
74149 +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
74150 +{
74151 +       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
74152 +}
74153 +
74154 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
74155 +{
74156 +       if (unlikely((mm)->context.pinned)) {
74157 +               BUG_ON(HYPERVISOR_update_va_mapping(
74158 +                              (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
74159 +                              pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
74160 +               set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
74161 +       } else {
74162 +               *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
74163 +       }
74164 +}
74165 +
74166 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
74167 +{
74168 +       if (unlikely((mm)->context.pinned)) {
74169 +               BUG_ON(HYPERVISOR_update_va_mapping(
74170 +                              (unsigned long)pmd,
74171 +                              pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, 
74172 +                                      PAGE_KERNEL_RO), 0));
74173 +               set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
74174 +       } else {
74175 +               *(pud) =  __pud(_PAGE_TABLE | __pa(pmd));
74176 +       }
74177 +}
74178 +
74179 +/*
74180 + * We need to use the batch mode here, but pgd_populate() won't
74181 + * be called frequently.
74182 + */
74183 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
74184 +{
74185 +       if (unlikely((mm)->context.pinned)) {
74186 +               BUG_ON(HYPERVISOR_update_va_mapping(
74187 +                              (unsigned long)pud,
74188 +                              pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, 
74189 +                                      PAGE_KERNEL_RO), 0));
74190 +               set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
74191 +               set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
74192 +       } else {
74193 +               *(pgd) =  __pgd(_PAGE_TABLE | __pa(pud));
74194 +               *(__user_pgd(pgd)) = *(pgd);
74195 +       }
74196 +}
74197 +
74198 +static inline void pmd_free(pmd_t *pmd)
74199 +{
74200 +       pte_t *ptep = virt_to_ptep(pmd);
74201 +
74202 +       if (!pte_write(*ptep)) {
74203 +               BUG_ON(HYPERVISOR_update_va_mapping(
74204 +                       (unsigned long)pmd,
74205 +                       pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
74206 +                       0));
74207 +       }
74208 +       free_page((unsigned long)pmd);
74209 +}
74210 +
74211 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
74212 +{
74213 +        pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
74214 +        return pmd;
74215 +}
74216 +
74217 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
74218 +{
74219 +        pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
74220 +        return pud;
74221 +}
74222 +
74223 +static inline void pud_free(pud_t *pud)
74224 +{
74225 +       pte_t *ptep = virt_to_ptep(pud);
74226 +
74227 +       if (!pte_write(*ptep)) {
74228 +               BUG_ON(HYPERVISOR_update_va_mapping(
74229 +                       (unsigned long)pud,
74230 +                       pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
74231 +                       0));
74232 +       }
74233 +       free_page((unsigned long)pud);
74234 +}
74235 +
74236 +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
74237 +{
74238 +        /*
74239 +         * We allocate two contiguous pages for kernel and user.
74240 +         */
74241 +        unsigned boundary;
74242 +       pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
74243 +
74244 +       if (!pgd)
74245 +               return NULL;
74246 +       /*
74247 +        * Copy kernel pointers in from init.
74248 +        * Could keep a freelist or slab cache of those because the kernel
74249 +        * part never changes.
74250 +        */
74251 +       boundary = pgd_index(__PAGE_OFFSET);
74252 +       memset(pgd, 0, boundary * sizeof(pgd_t));
74253 +       memcpy(pgd + boundary,
74254 +              init_level4_pgt + boundary,
74255 +              (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
74256 +
74257 +       memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
74258 +        /*
74259 +         * Set level3_user_pgt for vsyscall area
74260 +         */
74261 +       set_pgd(__user_pgd(pgd) + pgd_index(VSYSCALL_START), 
74262 +                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
74263 +       return pgd;
74264 +}
74265 +
74266 +static inline void pgd_free(pgd_t *pgd)
74267 +{
74268 +       pte_t *ptep = virt_to_ptep(pgd);
74269 +
74270 +       if (!pte_write(*ptep)) {
74271 +               xen_pgd_unpin(__pa(pgd));
74272 +               BUG_ON(HYPERVISOR_update_va_mapping(
74273 +                              (unsigned long)pgd,
74274 +                              pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
74275 +                              0));
74276 +       }
74277 +
74278 +       ptep = virt_to_ptep(__user_pgd(pgd));
74279 +
74280 +       if (!pte_write(*ptep)) {
74281 +               xen_pgd_unpin(__pa(__user_pgd(pgd)));
74282 +               BUG_ON(HYPERVISOR_update_va_mapping(
74283 +                              (unsigned long)__user_pgd(pgd),
74284 +                              pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT, 
74285 +                                      PAGE_KERNEL),
74286 +                              0));
74287 +       }
74288 +
74289 +       free_pages((unsigned long)pgd, 1);
74290 +}
74291 +
74292 +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
74293 +{
74294 +        pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
74295 +        if (pte)
74296 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
74297 +
74298 +       return pte;
74299 +}
74300 +
74301 +static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
74302 +{
74303 +       struct page *pte;
74304 +
74305 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
74306 +       return pte;
74307 +}
74308 +
74309 +/* Should really implement gc for free page table pages. This could be
74310 +   done with a reference count in struct page. */
74311 +
74312 +static inline void pte_free_kernel(pte_t *pte)
74313 +{
74314 +       BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
74315 +        make_page_writable(pte, XENFEAT_writable_page_tables);
74316 +       free_page((unsigned long)pte); 
74317 +}
74318 +
74319 +extern void pte_free(struct page *pte);
74320 +
74321 +//#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) 
74322 +//#define __pmd_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
74323 +//#define __pud_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
74324 +
74325 +#define __pte_free_tlb(tlb,x)   pte_free((x))
74326 +#define __pmd_free_tlb(tlb,x)   pmd_free((x))
74327 +#define __pud_free_tlb(tlb,x)   pud_free((x))
74328 +
74329 +#endif /* _X86_64_PGALLOC_H */
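/*
 * Example (illustrative sketch): the pattern this header applies to
 * every page-table page.  Before Xen will accept a page as part of a
 * pinned page table it must be mapped read-only in the guest (unless
 * XENFEAT_writable_page_tables is advertised), and it must be made
 * writable again before being freed -- compare pte_alloc_one_kernel()
 * and pte_free_kernel() above.  grab/drop names are hypothetical.
 */
static pte_t *grab_pt_page(void)
{
	pte_t *pt = (pte_t *)get_zeroed_page(GFP_KERNEL);

	if (pt)
		make_page_readonly(pt, XENFEAT_writable_page_tables);
	return pt;
}

static void drop_pt_page(pte_t *pt)
{
	make_page_writable(pt, XENFEAT_writable_page_tables);
	free_page((unsigned long)pt);
}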
74330 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/pgtable.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/pgtable.h
74331 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/pgtable.h        1970-01-01 01:00:00.000000000 +0100
74332 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/pgtable.h   2006-04-10 00:05:52.000000000 +0200
74333 @@ -0,0 +1,560 @@
74334 +#ifndef _X86_64_PGTABLE_H
74335 +#define _X86_64_PGTABLE_H
74336 +
74337 +/*
74338 + * This file contains the functions and defines necessary to modify and use
74339 + * the x86-64 page table tree.
74340 + */
74341 +#include <asm/processor.h>
74342 +#include <asm/fixmap.h>
74343 +#include <asm/bitops.h>
74344 +#include <linux/threads.h>
74345 +#include <linux/sched.h>
74346 +#include <asm/pda.h>
74347 +#ifdef CONFIG_XEN
74348 +#include <asm/hypervisor.h>
74349 +
74350 +extern pud_t level3_user_pgt[512];
74351 +extern pud_t init_level4_user_pgt[];
74352 +
74353 +extern void xen_init_pt(void);
74354 +
74355 +#define virt_to_ptep(__va)                                             \
74356 +({                                                                     \
74357 +       pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));             \
74358 +       pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));        \
74359 +       pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));        \
74360 +       pte_offset_kernel(__pmd, (unsigned long)(__va));                \
74361 +})
74362 +
74363 +#define arbitrary_virt_to_machine(__va)                                        \
74364 +({                                                                     \
74365 +       maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
74366 +       m | ((unsigned long)(__va) & (PAGE_SIZE-1));                    \
74367 +})
74368 +#endif
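/*
 * Example (illustrative): unlike virt_to_machine(), which assumes the
 * direct mapping, arbitrary_virt_to_machine() above walks the live
 * page tables, so it also works for vmalloc()/ioremap() addresses --
 * anything currently mapped.  'vp' is a hypothetical mapped pointer:
 *
 *	maddr_t ma = arbitrary_virt_to_machine(vp);
 */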
74369 +
74370 +extern pud_t level3_kernel_pgt[512];
74371 +extern pud_t level3_physmem_pgt[512];
74372 +extern pud_t level3_ident_pgt[512];
74373 +extern pmd_t level2_kernel_pgt[512];
74374 +extern pgd_t init_level4_pgt[];
74375 +extern pgd_t boot_level4_pgt[];
74376 +extern unsigned long __supported_pte_mask;
74377 +
74378 +#define swapper_pg_dir init_level4_pgt
74379 +
74380 +extern int nonx_setup(char *str);
74381 +extern void paging_init(void);
74382 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
74383 +
74384 +extern unsigned long pgkern_mask;
74385 +
74386 +/*
74387 + * ZERO_PAGE is a global shared page that is always zero: used
74388 + * for zero-mapped memory areas etc..
74389 + */
74390 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
74391 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
74392 +
74393 +/*
74394 + * PGDIR_SHIFT determines what a top-level page table entry can map
74395 + */
74396 +#define PGDIR_SHIFT    39
74397 +#define PTRS_PER_PGD   512
74398 +
74399 +/*
74400 + * 3rd level page
74401 + */
74402 +#define PUD_SHIFT      30
74403 +#define PTRS_PER_PUD   512
74404 +
74405 +/*
74406 + * PMD_SHIFT determines the size of the area a middle-level
74407 + * page table can map
74408 + */
74409 +#define PMD_SHIFT      21
74410 +#define PTRS_PER_PMD   512
74411 +
74412 +/*
74413 + * entries per page directory level
74414 + */
74415 +#define PTRS_PER_PTE   512
74416 +
74417 +#define pte_ERROR(e) \
74418 +       printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
74419 +#define pmd_ERROR(e) \
74420 +       printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
74421 +#define pud_ERROR(e) \
74422 +       printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
74423 +#define pgd_ERROR(e) \
74424 +       printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
74425 +
74426 +#define pgd_none(x)    (!pgd_val(x))
74427 +#define pud_none(x)    (!pud_val(x))
74428 +
74429 +#define set_pte_batched(pteptr, pteval) \
74430 +       queue_l1_entry_update(pteptr, (pteval))
74431 +
74432 +extern inline int pud_present(pud_t pud)       { return !pud_none(pud); }
74433 +
74434 +static inline void set_pte(pte_t *dst, pte_t val)
74435 +{
74436 +       *dst = val;
74437 +}
74438 +
74439 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
74440 +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
74441 +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
74442 +
74443 +static inline void pud_clear (pud_t * pud)
74444 +{
74445 +       set_pud(pud, __pud(0));
74446 +}
74447 +
74448 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
74449 +
74450 +static inline void pgd_clear (pgd_t * pgd)
74451 +{
74452 +        set_pgd(pgd, __pgd(0));
74453 +        set_pgd(__user_pgd(pgd), __pgd(0));
74454 +}
74455 +
74456 +#define pud_page(pud) \
74457 +    ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
74458 +
74459 +/*
74460 + * A note on implementation of this atomic 'get-and-clear' operation.
74461 + * This is actually very simple because Xen Linux can only run on a single
74462 + * processor. Therefore, we cannot race other processors setting the 'accessed'
74463 + * or 'dirty' bits on a page-table entry.
74464 + * Even if pages are shared between domains, that is not a problem because
74465 + * each domain will have separate page tables, with their own versions of
74466 + * accessed & dirty state.
74467 + */
74468 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte, 0))
74469 +
74470 +#if 0
74471 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
74472 +{
74473 +        pte_t pte = *xp;
74474 +        if (pte.pte)
74475 +                set_pte(xp, __pte_ma(0));
74476 +        return pte;
74477 +}
74478 +#endif
74479 +
74480 +struct mm_struct;
74481 +
74482 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
74483 +{
74484 +       pte_t pte;
74485 +       if (full) {
74486 +               pte = *ptep;
74487 +               *ptep = __pte(0);
74488 +       } else {
74489 +               pte = ptep_get_and_clear(mm, addr, ptep);
74490 +       }
74491 +       return pte;
74492 +}
74493 +
74494 +#define pte_same(a, b)         ((a).pte == (b).pte)
74495 +
74496 +#define pte_pgprot(a)  (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
74497 +
74498 +#define PMD_SIZE       (1UL << PMD_SHIFT)
74499 +#define PMD_MASK       (~(PMD_SIZE-1))
74500 +#define PUD_SIZE       (1UL << PUD_SHIFT)
74501 +#define PUD_MASK       (~(PUD_SIZE-1))
74502 +#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
74503 +#define PGDIR_MASK     (~(PGDIR_SIZE-1))
74504 +
74505 +#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
74506 +#define FIRST_USER_ADDRESS     0
74507 +
74508 +#ifndef __ASSEMBLY__
74509 +#define MAXMEM          0x3fffffffffffUL
74510 +#define VMALLOC_START    0xffffc20000000000UL
74511 +#define VMALLOC_END      0xffffe1ffffffffffUL
74512 +#define MODULES_VADDR    0xffffffff88000000UL
74513 +#define MODULES_END      0xfffffffffff00000UL
74514 +#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
74515 +
74516 +#define _PAGE_BIT_PRESENT      0
74517 +#define _PAGE_BIT_RW           1
74518 +#define _PAGE_BIT_USER         2
74519 +#define _PAGE_BIT_PWT          3
74520 +#define _PAGE_BIT_PCD          4
74521 +#define _PAGE_BIT_ACCESSED     5
74522 +#define _PAGE_BIT_DIRTY                6
74523 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
74524 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
74525 +#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
74526 +
74527 +#define _PAGE_PRESENT  0x001
74528 +#define _PAGE_RW       0x002
74529 +#define _PAGE_USER     0x004
74530 +#define _PAGE_PWT      0x008
74531 +#define _PAGE_PCD      0x010
74532 +#define _PAGE_ACCESSED 0x020
74533 +#define _PAGE_DIRTY    0x040
74534 +#define _PAGE_PSE      0x080   /* 2MB page */
74535 +#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
74536 +#define _PAGE_GLOBAL   0x100   /* Global TLB entry */
74537 +
74538 +#define _PAGE_PROTNONE 0x080   /* If not present */
74539 +#define _PAGE_NX        (1UL<<_PAGE_BIT_NX)
74540 +
74541 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
74542 +#define _KERNPG_TABLE  _PAGE_TABLE
74543 +
74544 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
74545 +
74546 +#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
74547 +#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
74548 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
74549 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
74550 +#define PAGE_COPY PAGE_COPY_NOEXEC
74551 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
74552 +#define PAGE_READONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
74553 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
74554 +#define __PAGE_KERNEL \
74555 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
74556 +#define __PAGE_KERNEL_EXEC \
74557 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER )
74558 +#define __PAGE_KERNEL_NOCACHE \
74559 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
74560 +#define __PAGE_KERNEL_RO \
74561 +       (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
74562 +#define __PAGE_KERNEL_VSYSCALL \
74563 +       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_USER )
74564 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
74565 +       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD | _PAGE_USER )
74566 +#define __PAGE_KERNEL_LARGE \
74567 +       (__PAGE_KERNEL | _PAGE_PSE | _PAGE_USER )
74568 +#define __PAGE_KERNEL_LARGE_EXEC \
74569 +       (__PAGE_KERNEL_EXEC | _PAGE_PSE | _PAGE_USER )
74570 +
74571 +
74572 +/*
74573 + * We don't support GLOBAL pages in xenolinux64
74574 + */
74575 +#define MAKE_GLOBAL(x) __pgprot((x))
74576 +
74577 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
74578 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
74579 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
74580 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
74581 +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
74582 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
74583 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
74584 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
74585 +
74586 +/*         xwr */
74587 +#define __P000 PAGE_NONE
74588 +#define __P001 PAGE_READONLY
74589 +#define __P010 PAGE_COPY
74590 +#define __P011 PAGE_COPY
74591 +#define __P100 PAGE_READONLY_EXEC
74592 +#define __P101 PAGE_READONLY_EXEC
74593 +#define __P110 PAGE_COPY_EXEC
74594 +#define __P111 PAGE_COPY_EXEC
74595 +
74596 +#define __S000 PAGE_NONE
74597 +#define __S001 PAGE_READONLY
74598 +#define __S010 PAGE_SHARED
74599 +#define __S011 PAGE_SHARED
74600 +#define __S100 PAGE_READONLY_EXEC
74601 +#define __S101 PAGE_READONLY_EXEC
74602 +#define __S110 PAGE_SHARED_EXEC
74603 +#define __S111 PAGE_SHARED_EXEC
74604 +
74605 +static inline unsigned long pgd_bad(pgd_t pgd)
74606 +{
74607 +       unsigned long val = pgd_val(pgd);
74608 +       val &= ~PTE_MASK;
74609 +       val &= ~(_PAGE_USER | _PAGE_DIRTY);
74610 +       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
74611 +}
74612 +
74613 +static inline unsigned long pud_bad(pud_t pud) 
74614 +{ 
74615 +       unsigned long val = pud_val(pud);
74616 +       val &= ~PTE_MASK; 
74617 +       val &= ~(_PAGE_USER | _PAGE_DIRTY); 
74618 +       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);      
74619 +} 
74620 +
74621 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
74622 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
74623 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
74624 +               set_pte((ptep), (pteval));                              \
74625 +} while (0)
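/*
 * Example (illustrative): set_pte_at() above first tries the cheap
 * path -- a single HYPERVISOR_update_va_mapping() hypercall, valid
 * only for the currently loaded address space -- and falls back to
 * set_pte() when the mm is foreign or the hypercall fails, e.g.:
 *
 *	set_pte_at(mm, addr, ptep, mk_pte(page, PAGE_KERNEL));
 */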
74626 +
74627 +#define pte_none(x)    (!(x).pte)
74628 +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
74629 +#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
74630 +
74631 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
74632 +
74633 +#define pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
74634 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
74635 +
74636 +#define pte_page(x)    pfn_to_page(pte_pfn(x))
74637 +
74638 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
74639 +{
74640 +       pte_t pte;
74641 +        
74642 +       (pte).pte = (pfn_to_mfn(page_nr) << PAGE_SHIFT);
74643 +       (pte).pte |= pgprot_val(pgprot);
74644 +       (pte).pte &= __supported_pte_mask;
74645 +       return pte;
74646 +}
74647 +
74648 +#define pfn_pte_ma(pfn, prot)  __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
74649 +/*
74650 + * The following only work if pte_present() is true.
74651 + * Undefined behaviour if not..
74652 + */
74653 +#define __pte_val(x)   ((x).pte)
74654 +
74655 +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
74656 +static inline int pte_user(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
74657 +static inline int pte_read(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
74658 +static inline int pte_exec(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
74659 +static inline int pte_dirty(pte_t pte)         { return __pte_val(pte) & _PAGE_DIRTY; }
74660 +static inline int pte_young(pte_t pte)         { return __pte_val(pte) & _PAGE_ACCESSED; }
74661 +static inline int pte_write(pte_t pte)         { return __pte_val(pte) & _PAGE_RW; }
74662 +static inline int pte_file(pte_t pte)          { return __pte_val(pte) & _PAGE_FILE; }
74663 +static inline int pte_huge(pte_t pte)          { return (__pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; }
74664 +
74665 +static inline pte_t pte_rdprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_USER; return pte; }
74666 +static inline pte_t pte_exprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_USER; return pte; }
74667 +static inline pte_t pte_mkclean(pte_t pte)     { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
74668 +static inline pte_t pte_mkold(pte_t pte)       { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
74669 +static inline pte_t pte_wrprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_RW; return pte; }
74670 +static inline pte_t pte_mkread(pte_t pte)      { __pte_val(pte) |= _PAGE_USER; return pte; }
74671 +static inline pte_t pte_mkexec(pte_t pte)      { __pte_val(pte) |= _PAGE_USER; return pte; }
74672 +static inline pte_t pte_mkdirty(pte_t pte)     { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
74673 +static inline pte_t pte_mkyoung(pte_t pte)     { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
74674 +static inline pte_t pte_mkwrite(pte_t pte)     { __pte_val(pte) |= _PAGE_RW; return pte; }
74675 +static inline pte_t pte_mkhuge(pte_t pte)      { __pte_val(pte) |= __LARGE_PTE; return pte; }
74676 +
74677 +struct vm_area_struct;
74678 +
74679 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
74680 +{
74681 +       pte_t pte = *ptep;
74682 +       int ret = pte_dirty(pte);
74683 +       if (ret)
74684 +               set_pte(ptep, pte_mkclean(pte));
74685 +       return ret;
74686 +}
74687 +
74688 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
74689 +{
74690 +       pte_t pte = *ptep;
74691 +       int ret = pte_young(pte);
74692 +       if (ret)
74693 +               set_pte(ptep, pte_mkold(pte));
74694 +       return ret;
74695 +}
74696 +
74697 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
74698 +{
74699 +       pte_t pte = *ptep;
74700 +       if (pte_write(pte))
74701 +               set_pte(ptep, pte_wrprotect(pte));
74702 +}
74703 +
74704 +/*
74705 + * Macro to mark a page protection value as "uncacheable".
74706 + */
74707 +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
74708 +
74709 +static inline int pmd_large(pmd_t pte) { 
74710 +       return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; 
74711 +}      
74712 +
74713 +
74714 +/*
74715 + * Conversion functions: convert a page and protection to a page entry,
74716 + * and a page entry and page directory to the page they refer to.
74717 + */
74718 +
74719 +/*
74720 + * Level 4 access.
74721 + * Never use these in the common code.
74722 + */
74723 +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
74724 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
74725 +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
74726 +#define pgd_offset_k(address) (pgd_t *)(init_level4_pgt + pgd_index(address))
74727 +#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
74728 +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
74729 +
74730 +/* PUD - Level3 access */
74731 +/* to find an entry in a page-table-directory. */
74732 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
74733 +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
74734 +static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address)
74735 +{ 
74736 +       return pud + pud_index(address);
74737 +} 
74738 +
74739 +/* Find the correct pud via the hidden fourth page-table level: */
74740 +
74741 +/* This accesses the reference page table of the boot cpu. 
74742 +   Other CPUs get synced lazily via the page fault handler. */
74743 +static inline pud_t *pud_offset_k(pgd_t *pgd, unsigned long address)
74744 +{
74745 +       return pud_offset(pgd_offset_k(address), address);
74746 +}
74747 +
74748 +/* PMD  - Level 2 access */
74749 +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
74750 +#define pmd_page(pmd)          (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
74751 +
74752 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
74753 +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
74754 +                                  pmd_index(address))
74755 +#define pmd_none(x)    (!pmd_val(x))
74756 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable
74757 +   page tables (wr.p.t.) can temporarily clear it. */
74758 +#define pmd_present(x) (pmd_val(x))
74759 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
74760 +#define        pmd_bad(x)      ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
74761 +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
74762 +#define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
74763 +
74764 +#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
74765 +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
74766 +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
74767 +
74768 +/* PTE - Level 1 access. */
74769 +
74770 +/* page, protection -> pte */
74771 +#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
74772 +#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
74773 +
74774 +/* physical address -> PTE */
74775 +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
74776 +{ 
74777 +       pte_t pte;
74778 +       (pte).pte = physpage | pgprot_val(pgprot); 
74779 +       return pte; 
74780 +}
74781 +
74782 +/* Change flags of a PTE */
74783 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
74784 +{ 
74785 +        (pte).pte &= _PAGE_CHG_MASK;
74786 +       (pte).pte |= pgprot_val(newprot);
74787 +       (pte).pte &= __supported_pte_mask;
74788 +       return pte; 
74789 +}
74790 +
74791 +#define pte_index(address) \
74792 +               (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
74793 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
74794 +                       pte_index(address))
74795 +
74796 +/* x86-64 always has all page tables mapped. */
74797 +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
74798 +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
74799 +#define pte_unmap(pte) /* NOP */
74800 +#define pte_unmap_nested(pte) /* NOP */ 
74801 +
74802 +#define update_mmu_cache(vma,address,pte) do { } while (0)
74803 +
74804 +/* We only update the dirty/accessed state if we set
74805 + * the dirty bit by hand in the kernel, since the hardware
74806 + * will do the accessed bit for us, and we don't want to
74807 + * race with other CPUs that might be updating the dirty
74808 + * bit at the same time. */
74809 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
74810 +#if 0
74811 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
74812 +       do {                                                              \
74813 +               if (__dirty) {                                            \
74814 +                       set_pte(__ptep, __entry);                         \
74815 +                       flush_tlb_page(__vma, __address);                 \
74816 +               }                                                         \
74817 +       } while (0)
74818 +#endif
74819 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
74820 +       do {                                                              \
74821 +               if (__dirty) {                                            \
74822 +                       if ( likely((__vma)->vm_mm == current->mm) ) {    \
74823 +                           BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
74824 +                       } else {                                          \
74825 +                            xen_l1_entry_update((__ptep), (__entry)); \
74826 +                           flush_tlb_page((__vma), (__address));         \
74827 +                       }                                                 \
74828 +               }                                                         \
74829 +       } while (0)
74830 +
74831 +/* Encode and de-code a swap entry */
74832 +#define __swp_type(x)                  (((x).val >> 1) & 0x3f)
74833 +#define __swp_offset(x)                        ((x).val >> 8)
74834 +#define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
74835 +#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val(pte) })
74836 +#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
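/*
 * Worked example (illustrative) of the swap-PTE layout above: bit 0
 * (present) stays clear, bits 1-6 hold the swap type, and the offset
 * starts at bit 8, so
 *
 *	__swp_entry(2, 0x1234).val == (2 << 1) | (0x1234 << 8) == 0x123404
 */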
74837 +
74838 +#endif /* !__ASSEMBLY__ */
74839 +
74840 +extern int kern_addr_valid(unsigned long addr); 
74841 +
74842 +#define DOMID_LOCAL (0xFFFFU)
74843 +
74844 +int direct_remap_pfn_range(struct vm_area_struct *vma,
74845 +                            unsigned long address,
74846 +                            unsigned long mfn,
74847 +                            unsigned long size,
74848 +                            pgprot_t prot,
74849 +                            domid_t  domid);
74850 +
74851 +int direct_kernel_remap_pfn_range(unsigned long address, 
74852 +                                 unsigned long mfn,
74853 +                                 unsigned long size, 
74854 +                                 pgprot_t prot,
74855 +                                 domid_t  domid);
74856 +
74857 +int create_lookup_pte_addr(struct mm_struct *mm,
74858 +                           unsigned long address,
74859 +                           uint64_t *ptep);
74860 +
74861 +int touch_pte_range(struct mm_struct *mm,
74862 +                    unsigned long address,
74863 +                    unsigned long size);
74864 +
74865 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)                \
74866 +               direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
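+/*
+ * Usage sketch (editor's note; foo_mmap, mfn and domid are hypothetical
+ * names): a driver in a privileged guest could map a foreign machine
+ * frame into a user vma from its ->mmap hook roughly like this:
+ *
+ *   static int foo_mmap(struct file *file, struct vm_area_struct *vma)
+ *   {
+ *           return direct_remap_pfn_range(vma, vma->vm_start, mfn,
+ *                                         vma->vm_end - vma->vm_start,
+ *                                         vma->vm_page_prot, domid);
+ *   }
+ */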
74867 +
74868 +#define MK_IOSPACE_PFN(space, pfn)     (pfn)
74869 +#define GET_IOSPACE(pfn)               0
74870 +#define GET_PFN(pfn)                   (pfn)
74871 +
74872 +#define HAVE_ARCH_UNMAPPED_AREA
74873 +
74874 +#define pgtable_cache_init()   do { } while (0)
74875 +#define check_pgt_cache()      do { } while (0)
74876 +
74877 +#define PAGE_AGP    PAGE_KERNEL_NOCACHE
74878 +#define HAVE_PAGE_AGP 1
74879 +
74880 +/* fs/proc/kcore.c */
74881 +#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
74882 +#define        kc_offset_to_vaddr(o) \
74883 +   (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
74884 +
74885 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
74886 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
74887 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
74888 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
74889 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
74890 +#define __HAVE_ARCH_PTE_SAME
74891 +#include <asm-generic/pgtable.h>
74892 +
74893 +#endif /* _X86_64_PGTABLE_H */
74894 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/processor.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/processor.h
74895 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/processor.h      1970-01-01 01:00:00.000000000 +0100
74896 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/processor.h 2006-04-10 00:05:52.000000000 +0200
74897 @@ -0,0 +1,493 @@
74898 +/*
74899 + * include/asm-x86_64/processor.h
74900 + *
74901 + * Copyright (C) 1994 Linus Torvalds
74902 + */
74903 +
74904 +#ifndef __ASM_X86_64_PROCESSOR_H
74905 +#define __ASM_X86_64_PROCESSOR_H
74906 +
74907 +#include <asm/segment.h>
74908 +#include <asm/page.h>
74909 +#include <asm/types.h>
74910 +#include <asm/sigcontext.h>
74911 +#include <asm/cpufeature.h>
74912 +#include <linux/config.h>
74913 +#include <linux/threads.h>
74914 +#include <asm/msr.h>
74915 +#include <asm/current.h>
74916 +#include <asm/system.h>
74917 +#include <asm/mmsegment.h>
74918 +#include <asm/percpu.h>
74919 +#include <linux/personality.h>
74920 +
74921 +#define TF_MASK                0x00000100
74922 +#define IF_MASK                0x00000200
74923 +#define IOPL_MASK      0x00003000
74924 +#define NT_MASK                0x00004000
74925 +#define VM_MASK                0x00020000
74926 +#define AC_MASK                0x00040000
74927 +#define VIF_MASK       0x00080000      /* virtual interrupt flag */
74928 +#define VIP_MASK       0x00100000      /* virtual interrupt pending */
74929 +#define ID_MASK                0x00200000
74930 +
74931 +#define desc_empty(desc) \
74932 +               (!((desc)->a | (desc)->b))
74933 +
74934 +#define desc_equal(desc1, desc2) \
74935 +               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
74936 +
74937 +/*
74938 + * Default implementation of macro that returns current
74939 + * instruction pointer ("program counter").
74940 + */
74941 +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
74942 +
74943 +/*
74944 + *  CPU type and hardware bug flags. Kept separately for each CPU.
74945 + */
74946 +
74947 +struct cpuinfo_x86 {
74948 +       __u8    x86;            /* CPU family */
74949 +       __u8    x86_vendor;     /* CPU vendor */
74950 +       __u8    x86_model;
74951 +       __u8    x86_mask;
74952 +       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
74953 +       __u32   x86_capability[NCAPINTS];
74954 +       char    x86_vendor_id[16];
74955 +       char    x86_model_id[64];
74956 +       int     x86_cache_size;  /* in KB */
74957 +       int     x86_clflush_size;
74958 +       int     x86_cache_alignment;
74959 +       int     x86_tlbsize;    /* number of 4K pages in DTLB/ITLB combined */
74960 +        __u8    x86_virt_bits, x86_phys_bits;
74961 +       __u8    x86_max_cores;  /* cpuid returned max cores value */
74962 +        __u32   x86_power;     
74963 +       __u32   extended_cpuid_level;   /* Max extended CPUID function supported */
74964 +       unsigned long loops_per_jiffy;
74965 +       __u8    apicid;
74966 +       __u8    booted_cores;   /* number of cores as seen by OS */
74967 +} ____cacheline_aligned;
74968 +
74969 +#define X86_VENDOR_INTEL 0
74970 +#define X86_VENDOR_CYRIX 1
74971 +#define X86_VENDOR_AMD 2
74972 +#define X86_VENDOR_UMC 3
74973 +#define X86_VENDOR_NEXGEN 4
74974 +#define X86_VENDOR_CENTAUR 5
74975 +#define X86_VENDOR_RISE 6
74976 +#define X86_VENDOR_TRANSMETA 7
74977 +#define X86_VENDOR_NUM 8
74978 +#define X86_VENDOR_UNKNOWN 0xff
74979 +
74980 +#ifdef CONFIG_SMP
74981 +extern struct cpuinfo_x86 cpu_data[];
74982 +#define current_cpu_data cpu_data[smp_processor_id()]
74983 +#else
74984 +#define cpu_data (&boot_cpu_data)
74985 +#define current_cpu_data boot_cpu_data
74986 +#endif
74987 +
74988 +extern char ignore_irq13;
74989 +
74990 +extern void identify_cpu(struct cpuinfo_x86 *);
74991 +extern void print_cpu_info(struct cpuinfo_x86 *);
74992 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
74993 +
74994 +/*
74995 + * EFLAGS bits
74996 + */
74997 +#define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
74998 +#define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
74999 +#define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
75000 +#define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
75001 +#define X86_EFLAGS_SF  0x00000080 /* Sign Flag */
75002 +#define X86_EFLAGS_TF  0x00000100 /* Trap Flag */
75003 +#define X86_EFLAGS_IF  0x00000200 /* Interrupt Flag */
75004 +#define X86_EFLAGS_DF  0x00000400 /* Direction Flag */
75005 +#define X86_EFLAGS_OF  0x00000800 /* Overflow Flag */
75006 +#define X86_EFLAGS_IOPL        0x00003000 /* IOPL mask */
75007 +#define X86_EFLAGS_NT  0x00004000 /* Nested Task */
75008 +#define X86_EFLAGS_RF  0x00010000 /* Resume Flag */
75009 +#define X86_EFLAGS_VM  0x00020000 /* Virtual Mode */
75010 +#define X86_EFLAGS_AC  0x00040000 /* Alignment Check */
75011 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
75012 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
75013 +#define X86_EFLAGS_ID  0x00200000 /* CPUID detection flag */
75014 +
75015 +/*
75016 + * Intel CPU features in CR4
75017 + */
75018 +#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
75019 +#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
75020 +#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
75021 +#define X86_CR4_DE             0x0008  /* enable debugging extensions */
75022 +#define X86_CR4_PSE            0x0010  /* enable page size extensions */
75023 +#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
75024 +#define X86_CR4_MCE            0x0040  /* Machine check enable */
75025 +#define X86_CR4_PGE            0x0080  /* enable global pages */
75026 +#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
75027 +#define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
75028 +#define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
75029 +
75030 +/*
75031 + * Save the cr4 feature set we're using (ie
75032 + * Pentium 4MB enable and PPro Global page
75033 + * enable), so that any CPUs that boot up
75034 + * after us can get the correct flags.
75035 + */
75036 +extern unsigned long mmu_cr4_features;
75037 +
75038 +static inline void set_in_cr4 (unsigned long mask)
75039 +{
75040 +       mmu_cr4_features |= mask;
75041 +       __asm__("movq %%cr4,%%rax\n\t"
75042 +               "orq %0,%%rax\n\t"
75043 +               "movq %%rax,%%cr4\n"
75044 +               : : "irg" (mask)
75045 +               :"ax");
75046 +}
75047 +
75048 +static inline void clear_in_cr4 (unsigned long mask)
75049 +{
75050 +       mmu_cr4_features &= ~mask;
75051 +       __asm__("movq %%cr4,%%rax\n\t"
75052 +               "andq %0,%%rax\n\t"
75053 +               "movq %%rax,%%cr4\n"
75054 +               : : "irg" (~mask)
75055 +               :"ax");
75056 +}
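+/*
+ * Illustrative call (editor's note): the point of going through these
+ * helpers rather than writing %cr4 directly is that mmu_cr4_features
+ * stays in sync, so CPUs brought up later inherit the same setting:
+ *
+ *   set_in_cr4(X86_CR4_TSD);     make RDTSC privileged (faults outside ring 0)
+ *   clear_in_cr4(X86_CR4_TSD);   allow user-mode RDTSC again
+ */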
75057 +
75058 +
75059 +/*
75060 + * Bus types
75061 + */
75062 +#define MCA_bus 0
75063 +#define MCA_bus__is_a_macro
75064 +
75065 +/*
75066 + * User space process size: 47 bits minus one guard page.
75067 + */
75068 +#define TASK_SIZE64    (0x800000000000UL - 4096)
75069 +
75070 +/* This decides where the kernel will search for a free chunk of vm
75071 + * space during mmap's.
75072 + */
75073 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
75074 +
75075 +#define TASK_SIZE              (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
75076 +#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
75077 +
75078 +#define TASK_UNMAPPED_BASE     PAGE_ALIGN(TASK_SIZE/3)
75079 +
75080 +/*
75081 + * Size of io_bitmap.
75082 + */
75083 +#define IO_BITMAP_BITS  65536
75084 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
75085 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
75086 +#ifndef CONFIG_X86_NO_TSS
75087 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
75088 +#endif
75089 +#define INVALID_IO_BITMAP_OFFSET 0x8000
75090 +
75091 +struct i387_fxsave_struct {
75092 +       u16     cwd;
75093 +       u16     swd;
75094 +       u16     twd;
75095 +       u16     fop;
75096 +       u64     rip;
75097 +       u64     rdp; 
75098 +       u32     mxcsr;
75099 +       u32     mxcsr_mask;
75100 +       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
75101 +       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
75102 +       u32     padding[24];
75103 +} __attribute__ ((aligned (16)));
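+/*
+ * Size check (editor's note): 4*2 + 2*8 + 2*4 + 128 + 256 + 96 bytes
+ * = 512 bytes, exactly the area FXSAVE/FXRSTOR expect, hence the
+ * 16-byte alignment required by those instructions.
+ */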
75104 +
75105 +union i387_union {
75106 +       struct i387_fxsave_struct       fxsave;
75107 +};
75108 +
75109 +#ifndef CONFIG_X86_NO_TSS
75110 +struct tss_struct {
75111 +       u32 reserved1;
75112 +       u64 rsp0;       
75113 +       u64 rsp1;
75114 +       u64 rsp2;
75115 +       u64 reserved2;
75116 +       u64 ist[7];
75117 +       u32 reserved3;
75118 +       u32 reserved4;
75119 +       u16 reserved5;
75120 +       u16 io_bitmap_base;
75121 +       /*
75122 +        * The extra 1 is there because the CPU will access an
75123 +        * additional byte beyond the end of the IO permission
75124 +        * bitmap. The extra byte must be all 1 bits, and must
75125 +        * be within the limit. Thus we have:
75126 +        *
75127 +        * 8192 bytes, the bitmap itself, for ports 0..0xffff
75128 +        * 8 bytes, for an extra "long" of ~0UL
75129 +        */
75130 +       unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
75131 +} __attribute__((packed)) ____cacheline_aligned;
75132 +
75133 +DECLARE_PER_CPU(struct tss_struct,init_tss);
75134 +#endif
75135 +
75136 +extern struct cpuinfo_x86 boot_cpu_data;
75137 +
75138 +#ifdef CONFIG_X86_VSMP
75139 +#define ARCH_MIN_TASKALIGN     (1 << INTERNODE_CACHE_SHIFT)
75140 +#define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
75141 +#else
75142 +#define ARCH_MIN_TASKALIGN     16
75143 +#define ARCH_MIN_MMSTRUCT_ALIGN        0
75144 +#endif
75145 +
75146 +struct thread_struct {
75147 +       unsigned long   rsp0;
75148 +       unsigned long   rsp;
75149 +       unsigned long   userrsp;        /* Copy from PDA */ 
75150 +       unsigned long   fs;
75151 +       unsigned long   gs;
75152 +       unsigned short  es, ds, fsindex, gsindex;       
75153 +/* Hardware debugging registers */
75154 +       unsigned long   debugreg0;  
75155 +       unsigned long   debugreg1;  
75156 +       unsigned long   debugreg2;  
75157 +       unsigned long   debugreg3;  
75158 +       unsigned long   debugreg6;  
75159 +       unsigned long   debugreg7;  
75160 +/* fault info */
75161 +       unsigned long   cr2, trap_no, error_code;
75162 +/* floating point info */
75163 +       union i387_union        i387  __attribute__((aligned(16)));
75164 +/* IO permissions. The bitmap could be moved into the GDT, which would make
75165 +   context switch faster for a limited number of ioperm-using tasks. -AK */
75166 +       int             ioperm;
75167 +       unsigned long   *io_bitmap_ptr;
75168 +       unsigned io_bitmap_max;
75169 +/* cached TLS descriptors. */
75170 +       u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
75171 +       unsigned int    iopl;
75172 +} __attribute__((aligned(16)));
75173 +
75174 +#define INIT_THREAD  { \
75175 +       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
75176 +}
75177 +
75178 +#ifndef CONFIG_X86_NO_TSS
75179 +#define INIT_TSS  { \
75180 +       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
75181 +}
75182 +#endif
75183 +
75184 +#define INIT_MMAP \
75185 +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
75186 +
75187 +#define start_thread(regs,new_rip,new_rsp) do { \
75188 +       asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));      \
75189 +       load_gs_index(0);                                                       \
75190 +       (regs)->rip = (new_rip);                                                 \
75191 +       (regs)->rsp = (new_rsp);                                                 \
75192 +       write_pda(oldrsp, (new_rsp));                                            \
75193 +       (regs)->cs = __USER_CS;                                                  \
75194 +       (regs)->ss = __USER_DS;                                                  \
75195 +       (regs)->eflags = 0x200;                                                  \
75196 +       set_fs(USER_DS);                                                         \
75197 +} while(0) 
75198 +
75199 +#define get_debugreg(var, register)                            \
75200 +       var = HYPERVISOR_get_debugreg(register)
75201 +#define set_debugreg(value, register)                  \
75202 +       HYPERVISOR_set_debugreg(register, value)
75203 +
75204 +struct task_struct;
75205 +struct mm_struct;
75206 +
75207 +/* Free all resources held by a thread. */
75208 +extern void release_thread(struct task_struct *);
75209 +
75210 +/* Prepare to copy thread state - unlazy all lazy status */
75211 +extern void prepare_to_copy(struct task_struct *tsk);
75212 +
75213 +/*
75214 + * create a kernel thread without removing it from tasklists
75215 + */
75216 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
75217 +
75218 +/*
75219 + * Return saved PC of a blocked thread.
75220 + * What is this good for? It will always be the scheduler or ret_from_fork.
75221 + */
75222 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
75223 +
75224 +extern unsigned long get_wchan(struct task_struct *p);
75225 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
75226 +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
75227 +#define KSTK_ESP(tsk) -1 /* sorry, doesn't work during syscalls */
75228 +
75229 +
75230 +struct microcode_header {
75231 +       unsigned int hdrver;
75232 +       unsigned int rev;
75233 +       unsigned int date;
75234 +       unsigned int sig;
75235 +       unsigned int cksum;
75236 +       unsigned int ldrver;
75237 +       unsigned int pf;
75238 +       unsigned int datasize;
75239 +       unsigned int totalsize;
75240 +       unsigned int reserved[3];
75241 +};
75242 +
75243 +struct microcode {
75244 +       struct microcode_header hdr;
75245 +       unsigned int bits[0];
75246 +};
75247 +
75248 +typedef struct microcode microcode_t;
75249 +typedef struct microcode_header microcode_header_t;
75250 +
75251 +/* microcode format is extended from prescott processors */
75252 +struct extended_signature {
75253 +       unsigned int sig;
75254 +       unsigned int pf;
75255 +       unsigned int cksum;
75256 +};
75257 +
75258 +struct extended_sigtable {
75259 +       unsigned int count;
75260 +       unsigned int cksum;
75261 +       unsigned int reserved[3];
75262 +       struct extended_signature sigs[0];
75263 +};
75264 +
75265 +/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
75266 +#define MICROCODE_IOCFREE      _IO('6',0)
75267 +
75268 +
75269 +#define ASM_NOP1 K8_NOP1
75270 +#define ASM_NOP2 K8_NOP2
75271 +#define ASM_NOP3 K8_NOP3
75272 +#define ASM_NOP4 K8_NOP4
75273 +#define ASM_NOP5 K8_NOP5
75274 +#define ASM_NOP6 K8_NOP6
75275 +#define ASM_NOP7 K8_NOP7
75276 +#define ASM_NOP8 K8_NOP8
75277 +
75278 +/* Opteron nops */
75279 +#define K8_NOP1 ".byte 0x90\n"
75280 +#define K8_NOP2        ".byte 0x66,0x90\n" 
75281 +#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
75282 +#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
75283 +#define K8_NOP5        K8_NOP3 K8_NOP2 
75284 +#define K8_NOP6        K8_NOP3 K8_NOP3
75285 +#define K8_NOP7        K8_NOP4 K8_NOP3
75286 +#define K8_NOP8        K8_NOP4 K8_NOP4
75287 +
75288 +#define ASM_NOP_MAX 8
75289 +
75290 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
75291 +static inline void rep_nop(void)
75292 +{
75293 +       __asm__ __volatile__("rep;nop": : :"memory");
75294 +}
75295 +
75296 +/* Stop speculative execution */
75297 +static inline void sync_core(void)
75298 +{ 
75299 +       int tmp;
75300 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
75301 +} 
75302 +
75303 +#define cpu_has_fpu 1
75304 +
75305 +#define ARCH_HAS_PREFETCH
75306 +static inline void prefetch(void *x) 
75307 +{ 
75308 +       asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
75309 +} 
75310 +
75311 +#define ARCH_HAS_PREFETCHW 1
75312 +static inline void prefetchw(void *x) 
75313 +{ 
75314 +       alternative_input("prefetcht0 (%1)",
75315 +                         "prefetchw (%1)",
75316 +                         X86_FEATURE_3DNOW,
75317 +                         "r" (x));
75318 +} 
75319 +
75320 +#define ARCH_HAS_SPINLOCK_PREFETCH 1
75321 +
75322 +#define spin_lock_prefetch(x)  prefetchw(x)
75323 +
75324 +#define cpu_relax()   rep_nop()
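+/*
+ * Typical spin-wait shape (editor's sketch; ready_flags is a made-up
+ * example variable): the PAUSE hint lowers power and keeps an SMT
+ * sibling from being starved while we poll:
+ *
+ *   while (!test_bit(0, &ready_flags))
+ *           cpu_relax();
+ */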
75325 +
75326 +/*
75327 + *      NSC/Cyrix CPU configuration register indexes
75328 + */
75329 +#define CX86_CCR0 0xc0
75330 +#define CX86_CCR1 0xc1
75331 +#define CX86_CCR2 0xc2
75332 +#define CX86_CCR3 0xc3
75333 +#define CX86_CCR4 0xe8
75334 +#define CX86_CCR5 0xe9
75335 +#define CX86_CCR6 0xea
75336 +#define CX86_CCR7 0xeb
75337 +#define CX86_DIR0 0xfe
75338 +#define CX86_DIR1 0xff
75339 +#define CX86_ARR_BASE 0xc4
75340 +#define CX86_RCR_BASE 0xdc
75341 +
75342 +/*
75343 + *      NSC/Cyrix CPU indexed register access macros
75344 + */
75345 +
75346 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
75347 +
75348 +#define setCx86(reg, data) do { \
75349 +       outb((reg), 0x22); \
75350 +       outb((data), 0x23); \
75351 +} while (0)
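+/*
+ * Usage sketch (editor's note, mirroring the idiom in the i386 Cyrix
+ * setup code): unlock the configuration space via the MAPEN bits in
+ * CCR3 before touching other registers, then restore CCR3:
+ *
+ *   unsigned char ccr3 = getCx86(CX86_CCR3);
+ *   setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);        set MAPEN
+ *   setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);   e.g. enable CPUID
+ *   setCx86(CX86_CCR3, ccr3);                        restore
+ */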
75352 +
75353 +static inline void serialize_cpu(void)
75354 +{
75355 +       __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
75356 +}
75357 +
75358 +static inline void __monitor(const void *eax, unsigned long ecx,
75359 +               unsigned long edx)
75360 +{
75361 +       /* "monitor %eax,%ecx,%edx;" */
75362 +       asm volatile(
75363 +               ".byte 0x0f,0x01,0xc8;"
75364 +               : :"a" (eax), "c" (ecx), "d"(edx));
75365 +}
75366 +
75367 +static inline void __mwait(unsigned long eax, unsigned long ecx)
75368 +{
75369 +       /* "mwait %eax,%ecx;" */
75370 +       asm volatile(
75371 +               ".byte 0x0f,0x01,0xc9;"
75372 +               : :"a" (eax), "c" (ecx));
75373 +}
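+/*
+ * These pair up as in the mwait idle loop (editor's illustration,
+ * after arch/x86_64/kernel/process.c): arm the monitor on a flags
+ * word, re-check the condition, then sleep until that word is written:
+ *
+ *   __monitor(&current_thread_info()->flags, 0, 0);
+ *   smp_mb();
+ *   if (!need_resched())
+ *           __mwait(0, 0);
+ */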
75374 +
75375 +#define stack_current() \
75376 +({                                                             \
75377 +       struct thread_info *ti;                                 \
75378 +       asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));  \
75379 +       ti->task;                                       \
75380 +})
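+/*
+ * How this works (editor's note): kernel stacks are THREAD_SIZE
+ * aligned with the struct thread_info at the bottom, so masking the
+ * stack pointer with CURRENT_MASK strips the offset within the stack
+ * and lands on the thread_info, and from there on ->task, without
+ * going through the PDA.
+ */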
75381 +
75382 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
75383 +
75384 +extern unsigned long boot_option_idle_override;
75385 +/* Boot loader type from the setup header */
75386 +extern int bootloader_type;
75387 +
75388 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
75389 +
75390 +#endif /* __ASM_X86_64_PROCESSOR_H */
75391 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/ptrace.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/ptrace.h
75392 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/ptrace.h 1970-01-01 01:00:00.000000000 +0100
75393 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/ptrace.h    2006-04-10 00:05:52.000000000 +0200
75394 @@ -0,0 +1,125 @@
75395 +#ifndef _X86_64_PTRACE_H
75396 +#define _X86_64_PTRACE_H
75397 +
75398 +#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) 
75399 +#define R15 0
75400 +#define R14 8
75401 +#define R13 16
75402 +#define R12 24
75403 +#define RBP 32
75404 +#define RBX 40
75405 +/* arguments: interrupts/non-tracing syscalls only save up to here */
75406 +#define R11 48
75407 +#define R10 56 
75408 +#define R9 64
75409 +#define R8 72
75410 +#define RAX 80
75411 +#define RCX 88
75412 +#define RDX 96
75413 +#define RSI 104
75414 +#define RDI 112
75415 +#define ORIG_RAX 120       /* = ERROR */ 
75416 +/* end of arguments */         
75417 +/* cpu exception frame or undefined in case of fast syscall. */
75418 +#define RIP 128
75419 +#define CS 136
75420 +#define EFLAGS 144
75421 +#define RSP 152
75422 +#define SS 160
75423 +#define ARGOFFSET R11
75424 +#endif /* __ASSEMBLY__ || __FRAME_OFFSETS */
75425 +
75426 +/* top of stack page */ 
75427 +#define FRAME_SIZE 168
75428 +
75429 +#define PTRACE_OLDSETOPTIONS         21
75430 +
75431 +#ifndef __ASSEMBLY__ 
75432 +
75433 +struct pt_regs {
75434 +       unsigned long r15;
75435 +       unsigned long r14;
75436 +       unsigned long r13;
75437 +       unsigned long r12;
75438 +       unsigned long rbp;
75439 +       unsigned long rbx;
75440 +/* arguments: non-interrupt/non-tracing syscalls only save up to here */
75441 +       unsigned long r11;
75442 +       unsigned long r10;      
75443 +       unsigned long r9;
75444 +       unsigned long r8;
75445 +       unsigned long rax;
75446 +       unsigned long rcx;
75447 +       unsigned long rdx;
75448 +       unsigned long rsi;
75449 +       unsigned long rdi;
75450 +       unsigned long orig_rax;
75451 +/* end of arguments */         
75452 +/* cpu exception frame or undefined */
75453 +       unsigned long rip;
75454 +       unsigned long cs;
75455 +       unsigned long eflags; 
75456 +       unsigned long rsp; 
75457 +       unsigned long ss;
75458 +/* top of stack page */ 
75459 +};
75460 +
75461 +#endif
75462 +
75463 +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
75464 +#define PTRACE_GETREGS            12
75465 +#define PTRACE_SETREGS            13
75466 +#define PTRACE_GETFPREGS          14
75467 +#define PTRACE_SETFPREGS          15
75468 +#define PTRACE_GETFPXREGS         18
75469 +#define PTRACE_SETFPXREGS         19
75470 +
75471 +/* only useful for accessing 32-bit programs */
75472 +#define PTRACE_GET_THREAD_AREA    25
75473 +#define PTRACE_SET_THREAD_AREA    26
75474 +
75475 +#define PTRACE_ARCH_PRCTL        30    /* arch_prctl for child */
75476 +
75477 +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 
75478 +#define user_mode(regs) (!!((regs)->cs & 3))
75479 +#define user_mode_vm(regs) user_mode(regs)
75480 +#define instruction_pointer(regs) ((regs)->rip)
75481 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
75482 +extern unsigned long profile_pc(struct pt_regs *regs);
75483 +#else
75484 +#define profile_pc(regs) instruction_pointer(regs)
75485 +#endif
75486 +
75487 +void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
75488 +
75489 +struct task_struct;
75490 +
75491 +extern unsigned long
75492 +convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs);
75493 +
75494 +enum {
75495 +        EF_CF   = 0x00000001,
75496 +        EF_PF   = 0x00000004,
75497 +        EF_AF   = 0x00000010,
75498 +        EF_ZF   = 0x00000040,
75499 +        EF_SF   = 0x00000080,
75500 +        EF_TF   = 0x00000100,
75501 +        EF_IE   = 0x00000200,
75502 +        EF_DF   = 0x00000400,
75503 +        EF_OF   = 0x00000800,
75504 +        EF_IOPL = 0x00003000,
75505 +        EF_IOPL_RING0 = 0x00000000,
75506 +        EF_IOPL_RING1 = 0x00001000,
75507 +        EF_IOPL_RING2 = 0x00002000,
75508 +        EF_NT   = 0x00004000,   /* nested task */
75509 +        EF_RF   = 0x00010000,   /* resume */
75510 +        EF_VM   = 0x00020000,   /* virtual mode */
75511 +        EF_AC   = 0x00040000,   /* alignment */
75512 +        EF_VIF  = 0x00080000,   /* virtual interrupt */
75513 +        EF_VIP  = 0x00100000,   /* virtual interrupt pending */
75514 +        EF_ID   = 0x00200000,   /* id */
75515 +};
75516 +
75517 +#endif
75518 +
75519 +#endif
75520 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/smp.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/smp.h
75521 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/smp.h    1970-01-01 01:00:00.000000000 +0100
75522 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/smp.h       2006-04-10 00:05:52.000000000 +0200
75523 @@ -0,0 +1,152 @@
75524 +#ifndef __ASM_SMP_H
75525 +#define __ASM_SMP_H
75526 +
75527 +/*
75528 + * We need the APIC definitions automatically as part of 'smp.h'
75529 + */
75530 +#ifndef __ASSEMBLY__
75531 +#include <linux/config.h>
75532 +#include <linux/threads.h>
75533 +#include <linux/cpumask.h>
75534 +#include <linux/bitops.h>
75535 +extern int disable_apic;
75536 +#endif
75537 +
75538 +#ifdef CONFIG_X86_LOCAL_APIC
75539 +#ifndef __ASSEMBLY__
75540 +#include <asm/fixmap.h>
75541 +#include <asm/mpspec.h>
75542 +#ifdef CONFIG_X86_IO_APIC
75543 +#include <asm/io_apic.h>
75544 +#endif
75545 +#include <asm/apic.h>
75546 +#include <asm/thread_info.h>
75547 +#endif
75548 +#endif
75549 +
75550 +#ifdef CONFIG_SMP
75551 +#ifndef ASSEMBLY
75552 +
75553 +#include <asm/pda.h>
75554 +
75555 +struct pt_regs;
75556 +
75557 +extern cpumask_t cpu_present_mask;
75558 +extern cpumask_t cpu_possible_map;
75559 +extern cpumask_t cpu_online_map;
75560 +extern cpumask_t cpu_initialized;
75561 +
75562 +/*
75563 + * Private routines/data
75564 + */
75565 +
75566 +extern void smp_alloc_memory(void);
75567 +extern volatile unsigned long smp_invalidate_needed;
75568 +extern int pic_mode;
75569 +extern void lock_ipi_call_lock(void);
75570 +extern void unlock_ipi_call_lock(void);
75571 +extern int smp_num_siblings;
75572 +extern void smp_send_reschedule(int cpu);
75573 +void smp_stop_cpu(void);
75574 +extern int smp_call_function_single(int cpuid, void (*func) (void *info),
75575 +                               void *info, int retry, int wait);
75576 +
75577 +extern cpumask_t cpu_sibling_map[NR_CPUS];
75578 +extern cpumask_t cpu_core_map[NR_CPUS];
75579 +extern int phys_proc_id[NR_CPUS];
75580 +extern int cpu_core_id[NR_CPUS];
75581 +
75582 +#define SMP_TRAMPOLINE_BASE 0x6000
75583 +
75584 +/*
75585 + * On x86 all CPUs are mapped 1:1 to the APIC space.
75586 + * This simplifies scheduling and IPI sending and
75587 + * compresses data structures.
75588 + */
75589 +
75590 +static inline int num_booting_cpus(void)
75591 +{
75592 +       return cpus_weight(cpu_possible_map);
75593 +}
75594 +
75595 +#define raw_smp_processor_id() read_pda(cpunumber)
75596 +
75597 +#ifdef CONFIG_X86_LOCAL_APIC
75598 +static inline int hard_smp_processor_id(void)
75599 +{
75600 +       /* we don't want to mark this access volatile - bad code generation */
75601 +       return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
75602 +}
75603 +#endif
75604 +
75605 +extern int safe_smp_processor_id(void);
75606 +extern int __cpu_disable(void);
75607 +extern void __cpu_die(unsigned int cpu);
75608 +extern void prefill_possible_map(void);
75609 +extern unsigned num_processors;
75610 +extern unsigned disabled_cpus;
75611 +
75612 +#endif /* !ASSEMBLY */
75613 +
75614 +#define NO_PROC_ID             0xFF            /* No processor magic marker */
75615 +
75616 +#endif
75617 +
75618 +#ifndef ASSEMBLY
75619 +/*
75620 + * Some low-level functions might want to know about
75621 + * the real APIC ID <-> CPU # mapping.
75622 + */
75623 +extern u8 x86_cpu_to_apicid[NR_CPUS];  /* physical ID */
75624 +extern u8 x86_cpu_to_log_apicid[NR_CPUS];
75625 +extern u8 bios_cpu_apicid[];
75626 +
75627 +#ifdef CONFIG_X86_LOCAL_APIC
75628 +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
75629 +{
75630 +       return cpus_addr(cpumask)[0];
75631 +}
75632 +
75633 +static inline int cpu_present_to_apicid(int mps_cpu)
75634 +{
75635 +       if (mps_cpu < NR_CPUS)
75636 +               return (int)bios_cpu_apicid[mps_cpu];
75637 +       else
75638 +               return BAD_APICID;
75639 +}
75640 +#endif
75641 +
75642 +#endif /* !ASSEMBLY */
75643 +
75644 +#ifndef CONFIG_SMP
75645 +#define stack_smp_processor_id() 0
75646 +#define safe_smp_processor_id() 0
75647 +#define cpu_logical_map(x) (x)
75648 +#else
75649 +#include <asm/thread_info.h>
75650 +#define stack_smp_processor_id() \
75651 +({                                                             \
75652 +       struct thread_info *ti;                                 \
75653 +       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
75654 +       ti->cpu;                                                \
75655 +})
75656 +#endif
75657 +
75658 +#ifndef __ASSEMBLY__
75659 +#ifdef CONFIG_X86_LOCAL_APIC
75660 +static __inline int logical_smp_processor_id(void)
75661 +{
75662 +       /* we don't want to mark this access volatile - bad code generation */
75663 +       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
75664 +}
75665 +#endif
75666 +#endif
75667 +
75668 +#ifdef CONFIG_SMP
75669 +#define cpu_physical_id(cpu)           x86_cpu_to_apicid[cpu]
75670 +#else
75671 +#define cpu_physical_id(cpu)           boot_cpu_id
75672 +#endif
75673 +
75674 +#endif
75675 +
75676 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/synch_bitops.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/synch_bitops.h
75677 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/synch_bitops.h   1970-01-01 01:00:00.000000000 +0100
75678 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/synch_bitops.h      2006-04-10 00:05:52.000000000 +0200
75679 @@ -0,0 +1,2 @@
75680 +
75681 +#include <asm-i386/mach-xen/asm/synch_bitops.h>
75682 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/system.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/system.h
75683 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/system.h 1970-01-01 01:00:00.000000000 +0100
75684 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/system.h    2006-04-10 00:05:52.000000000 +0200
75685 @@ -0,0 +1,434 @@
75686 +#ifndef __ASM_SYSTEM_H
75687 +#define __ASM_SYSTEM_H
75688 +
75689 +#include <linux/config.h>
75690 +#include <linux/kernel.h>
75691 +#include <asm/segment.h>
75692 +#include <asm/synch_bitops.h>
75693 +#include <asm/hypervisor.h>
75694 +#include <xen/interface/arch-x86_64.h>
75695 +
75696 +#ifdef __KERNEL__
75697 +
75698 +#ifdef CONFIG_SMP
75699 +#define __vcpu_id smp_processor_id()
75700 +#else
75701 +#define __vcpu_id 0
75702 +#endif
75703 +
75704 +#ifdef CONFIG_SMP
75705 +#define LOCK_PREFIX "lock ; "
75706 +#else
75707 +#define LOCK_PREFIX ""
75708 +#endif
75709 +
75710 +#define __STR(x) #x
75711 +#define STR(x) __STR(x)
75712 +
75713 +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
75714 +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
75715 +
75716 +/* frame pointer must be last for get_wchan */
75717 +#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
75718 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\n\t"
75719 +
75720 +#define __EXTRA_CLOBBER  \
75721 +       ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
75722 +
75723 +#define switch_to(prev,next,last) \
75724 +       asm volatile(SAVE_CONTEXT                                                   \
75725 +                    "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
75726 +                    "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
75727 +                    "call __switch_to\n\t"                                       \
75728 +                    ".globl thread_return\n"                                   \
75729 +                    "thread_return:\n\t"                                           \
75730 +                    "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
75731 +                    "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
75732 +                    LOCK "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"               \
75733 +                    "movq %%rax,%%rdi\n\t"                                       \
75734 +                    "jc   ret_from_fork\n\t"                                     \
75735 +                    RESTORE_CONTEXT                                                \
75736 +                    : "=a" (last)                                                \
75737 +                    : [next] "S" (next), [prev] "D" (prev),                      \
75738 +                      [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
75739 +                      [ti_flags] "i" (offsetof(struct thread_info, flags)),\
75740 +                      [tif_fork] "i" (TIF_FORK),                         \
75741 +                      [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
75742 +                      [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))   \
75743 +                    : "memory", "cc" __EXTRA_CLOBBER)
75744 +    
75745 +
75746 +extern void load_gs_index(unsigned);
75747 +
75748 +/*
75749 + * Load a segment. Fall back on loading the zero
75750 + * segment if something goes wrong..
75751 + */
75752 +#define loadsegment(seg,value) \
75753 +       asm volatile("\n"                       \
75754 +               "1:\t"                          \
75755 +               "movl %k0,%%" #seg "\n"         \
75756 +               "2:\n"                          \
75757 +               ".section .fixup,\"ax\"\n"      \
75758 +               "3:\t"                          \
75759 +               "movl %1,%%" #seg "\n\t"        \
75760 +               "jmp 2b\n"                      \
75761 +               ".previous\n"                   \
75762 +               ".section __ex_table,\"a\"\n\t" \
75763 +               ".align 8\n\t"                  \
75764 +               ".quad 1b,3b\n"                 \
75765 +               ".previous"                     \
75766 +               : :"r" (value), "r" (0))
75767 +
75768 +#define set_debug(value,register) \
75769 +                __asm__("movq %0,%%db" #register  \
75770 +               : /* no output */ \
75771 +               :"r" ((unsigned long) value))
75772 +
75773 +
75774 +#ifdef __KERNEL__
75775 +struct alt_instr { 
75776 +       __u8 *instr;            /* original instruction */
75777 +       __u8 *replacement;
75778 +       __u8  cpuid;            /* cpuid bit set for replacement */
75779 +       __u8  instrlen;         /* length of original instruction */
75780 +       __u8  replacementlen;   /* length of new instruction, <= instrlen */ 
75781 +       __u8  pad[5];
75782 +}; 
75783 +#endif
75784 +
75785 +/*
75786 + * Alternative instructions for different CPU types or capabilities.
75787 + * 
75788 + * This allows the use of optimized instructions even on generic
75789 + * binary kernels.
75790 + *
75791 + * The length of oldinstr must be greater than or equal to the length
75792 + * of newinstr; oldinstr can be padded with nops as needed.
75793 + *
75794 + * For non-barrier-like inlines please define new variants
75795 + * without the volatile and memory clobber.
75796 + */
75797 +#define alternative(oldinstr, newinstr, feature)       \
75798 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                 \
75799 +                     ".section .altinstructions,\"a\"\n"            \
75800 +                     "  .align 8\n"                                   \
75801 +                     "  .quad 661b\n"            /* label */          \
75802 +                     "  .quad 663f\n"            /* new instruction */ \
75803 +                     "  .byte %c0\n"             /* feature bit */    \
75804 +                     "  .byte 662b-661b\n"       /* sourcelen */      \
75805 +                     "  .byte 664f-663f\n"       /* replacementlen */ \
75806 +                     ".previous\n"                                     \
75807 +                     ".section .altinstr_replacement,\"ax\"\n"         \
75808 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
75809 +                     ".previous" :: "i" (feature) : "memory")  
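+/*
+ * Shape of a call (editor's illustration only; it mirrors the i386
+ * smp_mb() trick, and since MFENCE is always present on x86-64 this
+ * exact pair is shown purely for form):
+ *
+ *   alternative("lock; addl $0,0(%%rsp)", "mfence", X86_FEATURE_XMM2);
+ *
+ * oldinstr is what a generic kernel executes; CPUs advertising the
+ * feature bit get it patched to newinstr at boot.
+ */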
75810 +
75811 +/*
75812 + * Alternative inline assembly with input.
75813 + * 
75814 + * Peculiarities:
75815 + * No memory clobber here. 
75816 + * Argument numbers start with 1.
75817 + * It is best to use fixed-size constraints (like "r", used as (%1)).
75818 + * If you use variable-sized constraints like "m" or "g" in the
75819 + * replacement, make sure to pad to the worst-case length.
75820 + */
75821 +#define alternative_input(oldinstr, newinstr, feature, input...)       \
75822 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
75823 +                     ".section .altinstructions,\"a\"\n"               \
75824 +                     "  .align 8\n"                                    \
75825 +                     "  .quad 661b\n"            /* label */           \
75826 +                     "  .quad 663f\n"            /* new instruction */ \
75827 +                     "  .byte %c0\n"             /* feature bit */     \
75828 +                     "  .byte 662b-661b\n"       /* sourcelen */       \
75829 +                     "  .byte 664f-663f\n"       /* replacementlen */  \
75830 +                     ".previous\n"                                     \
75831 +                     ".section .altinstr_replacement,\"ax\"\n"         \
75832 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
75833 +                     ".previous" :: "i" (feature), ##input)
75834 +
75835 +/* Like alternative_input, but with a single output argument */
75836 +#define alternative_io(oldinstr, newinstr, feature, output, input...) \
75837 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
75838 +                     ".section .altinstructions,\"a\"\n"               \
75839 +                     "  .align 8\n"                                    \
75840 +                     "  .quad 661b\n"            /* label */           \
75841 +                     "  .quad 663f\n"            /* new instruction */ \
75842 +                     "  .byte %c[feat]\n"        /* feature bit */     \
75843 +                     "  .byte 662b-661b\n"       /* sourcelen */       \
75844 +                     "  .byte 664f-663f\n"       /* replacementlen */  \
75845 +                     ".previous\n"                                     \
75846 +                     ".section .altinstr_replacement,\"ax\"\n"         \
75847 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
75848 +                     ".previous" : output : [feat] "i" (feature), ##input)
75849 +
75850 +/*
75851 + * Clear and set 'TS' bit respectively
75852 + */
75853 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
75854 +
75855 +static inline unsigned long read_cr0(void)
75856 +{ 
75857 +       unsigned long cr0;
75858 +       asm volatile("movq %%cr0,%0" : "=r" (cr0));
75859 +       return cr0;
75860 +} 
75861 +
75862 +static inline void write_cr0(unsigned long val) 
75863 +{ 
75864 +       asm volatile("movq %0,%%cr0" :: "r" (val));
75865 +} 
75866 +
75867 +#define read_cr3() ({ \
75868 +       unsigned long __dummy; \
75869 +       asm("movq %%cr3,%0" : "=r" (__dummy)); \
75870 +       machine_to_phys(__dummy); \
75871 +})
75872 +
75873 +static inline unsigned long read_cr4(void)
75874 +{ 
75875 +       unsigned long cr4;
75876 +       asm("movq %%cr4,%0" : "=r" (cr4));
75877 +       return cr4;
75878 +} 
75879 +
75880 +static inline void write_cr4(unsigned long val)
75881 +{ 
75882 +       asm volatile("movq %0,%%cr4" :: "r" (val));
75883 +} 
75884 +
75885 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
75886 +
75887 +#define wbinvd() \
75888 +       __asm__ __volatile__ ("wbinvd": : :"memory")
75889 +
75890 +/*
75891 + * On SMP systems, when the scheduler does migration-cost autodetection,
75892 + * it needs a way to flush as much of the CPU's caches as possible.
75893 + */
75894 +static inline void sched_cacheflush(void)
75895 +{
75896 +       wbinvd();
75897 +}
75898 +
75899 +#endif /* __KERNEL__ */
75900 +
75901 +#define nop() __asm__ __volatile__ ("nop")
75902 +
75903 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
75904 +
75905 +#define tas(ptr) (xchg((ptr),1))
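+/*
+ * Illustrative use (editor's sketch; lock_word is a made-up variable):
+ * because the exchange is atomic, xchg() gives a minimal test-and-set
+ * spinlock, which is what tas() alludes to:
+ *
+ *   while (xchg(&lock_word, 1) != 0)
+ *           cpu_relax();
+ *   ...critical section...
+ *   lock_word = 0;       real code would order this store with a barrier
+ */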
75906 +
75907 +#define __xg(x) ((volatile long *)(x))
75908 +
75909 +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
75910 +{
75911 +       *ptr = val;
75912 +}
75913 +
75914 +#define _set_64bit set_64bit
75915 +
75916 +/*
75917 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
75918 + * Note 2: xchg has a side effect, so the volatile attribute is necessary;
75919 + *       in general *ptr is an output argument as well as an input. --ANK
75920 + */
75921 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
75922 +{
75923 +       switch (size) {
75924 +               case 1:
75925 +                       __asm__ __volatile__("xchgb %b0,%1"
75926 +                               :"=q" (x)
75927 +                               :"m" (*__xg(ptr)), "0" (x)
75928 +                               :"memory");
75929 +                       break;
75930 +               case 2:
75931 +                       __asm__ __volatile__("xchgw %w0,%1"
75932 +                               :"=r" (x)
75933 +                               :"m" (*__xg(ptr)), "0" (x)
75934 +                               :"memory");
75935 +                       break;
75936 +               case 4:
75937 +                       __asm__ __volatile__("xchgl %k0,%1"
75938 +                               :"=r" (x)
75939 +                               :"m" (*__xg(ptr)), "0" (x)
75940 +                               :"memory");
75941 +                       break;
75942 +               case 8:
75943 +                       __asm__ __volatile__("xchgq %0,%1"
75944 +                               :"=r" (x)
75945 +                               :"m" (*__xg(ptr)), "0" (x)
75946 +                               :"memory");
75947 +                       break;
75948 +       }
75949 +       return x;
75950 +}
75951 +
75952 +/*
75953 + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
75954 + * store NEW in MEM.  Return the initial value in MEM.  Success is
75955 + * indicated by comparing RETURN with OLD.
75956 + */
75957 +
75958 +#define __HAVE_ARCH_CMPXCHG 1
75959 +
75960 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
75961 +                                     unsigned long new, int size)
75962 +{
75963 +       unsigned long prev;
75964 +       switch (size) {
75965 +       case 1:
75966 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
75967 +                                    : "=a"(prev)
75968 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
75969 +                                    : "memory");
75970 +               return prev;
75971 +       case 2:
75972 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
75973 +                                    : "=a"(prev)
75974 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
75975 +                                    : "memory");
75976 +               return prev;
75977 +       case 4:
75978 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
75979 +                                    : "=a"(prev)
75980 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
75981 +                                    : "memory");
75982 +               return prev;
75983 +       case 8:
75984 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
75985 +                                    : "=a"(prev)
75986 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
75987 +                                    : "memory");
75988 +               return prev;
75989 +       }
75990 +       return old;
75991 +}
75992 +
75993 +#define cmpxchg(ptr,o,n)\
75994 +       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
75995 +                                       (unsigned long)(n),sizeof(*(ptr))))
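+/*
+ * Canonical retry loop (editor's sketch; counter is a made-up
+ * pointer): cmpxchg() returns what was really in memory, so success
+ * means the return value equals the value we started from:
+ *
+ *   unsigned long old, new;
+ *   do {
+ *           old = *counter;
+ *           new = old + 1;
+ *   } while (cmpxchg(counter, old, new) != old);
+ */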
75996 +
75997 +#ifdef CONFIG_SMP
75998 +#define smp_mb()       mb()
75999 +#define smp_rmb()      rmb()
76000 +#define smp_wmb()      wmb()
76001 +#define smp_read_barrier_depends()     do {} while(0)
76002 +#else
76003 +#define smp_mb()       barrier()
76004 +#define smp_rmb()      barrier()
76005 +#define smp_wmb()      barrier()
76006 +#define smp_read_barrier_depends()     do {} while(0)
76007 +#endif
76008 +
76009 +    
76010 +/*
76011 + * Force strict CPU ordering.
76012 + * And yes, this is required on UP too when we're talking
76013 + * to devices.
76014 + */
76015 +#define mb()   asm volatile("mfence":::"memory")
76016 +#define rmb()  asm volatile("lfence":::"memory")
76017 +
76018 +#ifdef CONFIG_UNORDERED_IO
76019 +#define wmb()  asm volatile("sfence" ::: "memory")
76020 +#else
76021 +#define wmb()  asm volatile("" ::: "memory")
76022 +#endif
76023 +#define read_barrier_depends() do {} while(0)
76024 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
76025 +#define set_wmb(var, value) do { var = value; wmb(); } while (0)
76026 +
76027 +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
76028 +
76029 +
76030 +/* 
76031 + * The use of 'barrier' in the following reflects these macros' role as
76032 + * local-lock operations. Reentrancy must be prevented (e.g., by __cli())
76033 + * /before/ the following critical operations are executed. All critical
76034 + * operations must complete /before/ reentrancy is permitted (e.g., by
76035 + * __sti()). The Alpha architecture, for example, also includes these barriers.
76036 + */
76037 +
76038 +#define __cli()                                                                \
76039 +do {                                                                   \
76040 +       vcpu_info_t *_vcpu;                                             \
76041 +       preempt_disable();                                              \
76042 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76043 +       _vcpu->evtchn_upcall_mask = 1;                                  \
76044 +       preempt_enable_no_resched();                                    \
76045 +       barrier();                                                      \
76046 +} while (0)
76047 +
76048 +#define __sti()                                                                \
76049 +do {                                                                   \
76050 +       vcpu_info_t *_vcpu;                                             \
76051 +       barrier();                                                      \
76052 +       preempt_disable();                                              \
76053 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76054 +       _vcpu->evtchn_upcall_mask = 0;                                  \
76055 +       barrier(); /* unmask then check (avoid races) */                \
76056 +       if ( unlikely(_vcpu->evtchn_upcall_pending) )                   \
76057 +               force_evtchn_callback();                                \
76058 +       preempt_enable();                                               \
76059 +} while (0)
76060 +
76061 +#define __save_flags(x)                                                        \
76062 +do {                                                                   \
76063 +       vcpu_info_t *_vcpu;                                             \
76064 +       preempt_disable();                                              \
76065 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76066 +       (x) = _vcpu->evtchn_upcall_mask;                                \
76067 +       preempt_enable();                                               \
76068 +} while (0)
76069 +
76070 +#define __restore_flags(x)                                             \
76071 +do {                                                                   \
76072 +       vcpu_info_t *_vcpu;                                             \
76073 +       barrier();                                                      \
76074 +       preempt_disable();                                              \
76075 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76076 +       if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {                   \
76077 +               barrier(); /* unmask then check (avoid races) */        \
76078 +               if ( unlikely(_vcpu->evtchn_upcall_pending) )           \
76079 +                       force_evtchn_callback();                        \
76080 +               preempt_enable();                                       \
76081 +       } else                                                          \
76082 +               preempt_enable_no_resched();                            \
76083 +} while (0)
76084 +
76085 +#define __save_and_cli(x)                                              \
76086 +do {                                                                   \
76087 +       vcpu_info_t *_vcpu;                                             \
76088 +       preempt_disable();                                              \
76089 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76090 +       (x) = _vcpu->evtchn_upcall_mask;                                \
76091 +       _vcpu->evtchn_upcall_mask = 1;                                  \
76092 +       preempt_enable_no_resched();                                    \
76093 +       barrier();                                                      \
76094 +} while (0)
76095 +
76096 +#define local_irq_save(x)      __save_and_cli(x)
76097 +#define local_irq_restore(x)   __restore_flags(x)
76098 +#define local_save_flags(x)    __save_flags(x)
76099 +#define local_irq_disable()    __cli()
76100 +#define local_irq_enable()     __sti()
76101 +
76102 +/* Cannot use preempt_enable() here as we would recurse in preempt_schedule(). */
76103 +#define irqs_disabled()                                                        \
76104 +({     int ___x;                                                       \
76105 +       vcpu_info_t *_vcpu;                                             \
76106 +       preempt_disable();                                              \
76107 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76108 +       ___x = (_vcpu->evtchn_upcall_mask != 0);                        \
76109 +       preempt_enable_no_resched();                                    \
76110 +       ___x; })
76111 +
76112 +#define safe_halt()            ((void)0)
76113 +#define halt()                 ((void)0)
76114 +
76115 +void cpu_idle_wait(void);
76116 +
76117 +extern unsigned long arch_align_stack(unsigned long sp);
76118 +
76119 +#endif
76120 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/timer.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/timer.h
76121 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/timer.h  1970-01-01 01:00:00.000000000 +0100
76122 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/timer.h     2006-04-10 00:05:52.000000000 +0200
76123 @@ -0,0 +1,67 @@
76124 +#ifndef _ASMi386_TIMER_H
76125 +#define _ASMi386_TIMER_H
76126 +#include <linux/init.h>
76127 +
76128 +/**
76129 + * struct timer_opts - used to define a timer source
76130 + *
76131 + * @name: name of the timer.
76132 + * @init: Probes and initializes the timer. Takes clock= override 
76133 + *        string as an argument. Returns 0 on success, anything else
76134 + *        on failure.
76135 + * @mark_offset: called by the timer interrupt.
76136 + * @get_offset:  called by gettimeofday(). Returns the number of microseconds
76137 + *               since the last timer interrupt.
76138 + * @monotonic_clock: returns the number of nanoseconds since the init of the
76139 + *                   timer.
76140 + * @delay: delays this many clock cycles.
76141 + */
76142 +struct timer_opts {
76143 +       char* name;
76144 +       void (*mark_offset)(void);
76145 +       unsigned long (*get_offset)(void);
76146 +       unsigned long long (*monotonic_clock)(void);
76147 +       void (*delay)(unsigned long);
76148 +       unsigned long (*read_timer)(void);
76149 +       int (*suspend)(pm_message_t state);
76150 +       int (*resume)(void);
76151 +};
76152 +
76153 +struct init_timer_opts {
76154 +       int (*init)(char *override);
76155 +       struct timer_opts *opts;
76156 +};
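+/*
+ * Shape of a timer source (editor's sketch; the foo_* callbacks are
+ * hypothetical, cf. the real timer_tsc/timer_pit instances):
+ *
+ *   static struct timer_opts timer_foo = {
+ *           .name            = "foo",
+ *           .mark_offset     = mark_offset_foo,
+ *           .get_offset      = get_offset_foo,
+ *           .monotonic_clock = monotonic_clock_foo,
+ *           .delay           = delay_foo,
+ *   };
+ *
+ *   static struct init_timer_opts timer_foo_init = {
+ *           .init = init_foo,
+ *           .opts = &timer_foo,
+ *   };
+ */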
76157 +
76158 +#define TICK_SIZE (tick_nsec / 1000)
76159 +
76160 +extern struct timer_opts* __init select_timer(void);
76161 +extern void clock_fallback(void);
76162 +void setup_pit_timer(void);
76163 +
76164 +/* Modifiers for buggy PIT handling */
76165 +
76166 +extern int pit_latch_buggy;
76167 +
76168 +extern struct timer_opts *cur_timer;
76169 +extern int timer_ack;
76170 +
76171 +/* list of externed timers */
76172 +extern struct timer_opts timer_none;
76173 +extern struct timer_opts timer_pit;
76174 +extern struct init_timer_opts timer_pit_init;
76175 +extern struct init_timer_opts timer_tsc_init;
76176 +#ifdef CONFIG_X86_CYCLONE_TIMER
76177 +extern struct init_timer_opts timer_cyclone_init;
76178 +#endif
76179 +
76180 +extern unsigned long calibrate_tsc(void);
76181 +extern void init_cpu_khz(void);
76182 +#ifdef CONFIG_HPET_TIMER
76183 +extern struct init_timer_opts timer_hpet_init;
76184 +extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr);
76185 +#endif
76186 +
76187 +#ifdef CONFIG_X86_PM_TIMER
76188 +extern struct init_timer_opts timer_pmtmr_init;
76189 +#endif
76190 +#endif
76191 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/tlbflush.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/tlbflush.h
76192 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/tlbflush.h       1970-01-01 01:00:00.000000000 +0100
76193 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/tlbflush.h  2006-04-10 00:05:52.000000000 +0200
76194 @@ -0,0 +1,104 @@
76195 +#ifndef _X8664_TLBFLUSH_H
76196 +#define _X8664_TLBFLUSH_H
76197 +
76198 +#include <linux/config.h>
76199 +#include <linux/mm.h>
76200 +#include <asm/processor.h>
76201 +
76202 +#define __flush_tlb()  xen_tlb_flush()
76203 +
76204 +/*
76205 + * Global pages have to be flushed a bit differently. Not a real
76206 + * performance problem because this does not happen often.
76207 + */
76208 +#define __flush_tlb_global()   xen_tlb_flush()
76209 +
76210 +
76211 +extern unsigned long pgkern_mask;
76212 +
76213 +#define __flush_tlb_all() __flush_tlb_global()
76214 +
76215 +#define __flush_tlb_one(addr)  xen_invlpg((unsigned long)addr)
76216 +
76217 +
76218 +/*
76219 + * TLB flushing:
76220 + *
76221 + *  - flush_tlb() flushes the current mm struct TLBs
76222 + *  - flush_tlb_all() flushes all processes' TLBs
76223 + *  - flush_tlb_mm(mm) flushes the specified mm context's TLBs
76224 + *  - flush_tlb_page(vma, vmaddr) flushes one page
76225 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
76226 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
76227 + *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
76228 + *
76229 + * x86-64 can only flush individual pages or full VMs. For a range flush
76230 + * we always do the full VM.  It might be worth testing whether a few
76231 + * INVLPGs in a row are a win for small ranges.
76232 + */
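+/*
+ * Rule of thumb (editor's note): pick the narrowest flush that covers
+ * the change, e.g. after rewriting a single user PTE
+ *
+ *   set_pte(ptep, entry);
+ *   flush_tlb_page(vma, address);
+ *
+ * while changes to kernel mappings go through
+ *
+ *   flush_tlb_kernel_range(start, end);
+ *
+ * which, as defined below, degenerates to a full flush_tlb_all().
+ */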
76233 +
76234 +#ifndef CONFIG_SMP
76235 +
76236 +#define flush_tlb() __flush_tlb()
76237 +#define flush_tlb_all() __flush_tlb_all()
76238 +#define local_flush_tlb() __flush_tlb()
76239 +
76240 +static inline void flush_tlb_mm(struct mm_struct *mm)
76241 +{
76242 +       if (mm == current->active_mm)
76243 +               __flush_tlb();
76244 +}
76245 +
76246 +static inline void flush_tlb_page(struct vm_area_struct *vma,
76247 +       unsigned long addr)
76248 +{
76249 +       if (vma->vm_mm == current->active_mm)
76250 +               __flush_tlb_one(addr);
76251 +}
76252 +
76253 +static inline void flush_tlb_range(struct vm_area_struct *vma,
76254 +       unsigned long start, unsigned long end)
76255 +{
76256 +       if (vma->vm_mm == current->active_mm)
76257 +               __flush_tlb();
76258 +}
76259 +
76260 +#else
76261 +
76262 +#include <asm/smp.h>
76263 +
76264 +#define local_flush_tlb() \
76265 +       __flush_tlb()
76266 +
76267 +extern void flush_tlb_all(void);
76268 +extern void flush_tlb_current_task(void);
76269 +extern void flush_tlb_mm(struct mm_struct *);
76270 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
76271 +
76272 +#define flush_tlb()    flush_tlb_current_task()
76273 +
76274 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
76275 +{
76276 +       flush_tlb_mm(vma->vm_mm);
76277 +}
76278 +
76279 +#define TLBSTATE_OK    1
76280 +#define TLBSTATE_LAZY  2
76281 +
76282 +/* Roughly an IPI every 20MB with 4k pages for freeing page table
76283 +   ranges. Cost is about 42k of memory for each CPU. */
76284 +#define ARCH_FREE_PTE_NR 5350  
76285 +
76286 +#endif
76287 +
76288 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
76289 +
76290 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
76291 +                                     unsigned long start, unsigned long end)
76292 +{
76293 +       /* x86_64 does not keep any page table caches in a software TLB.
76294 +          The CPUs do in their hardware TLBs, but they are handled
76295 +          by the normal TLB flushing algorithms. */
76296 +}
76297 +
76298 +#endif /* _X8664_TLBFLUSH_H */
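As a sketch of how a caller uses this API after rewriting PTEs (the helper name is invented; under Xen the macros above turn the flush into xen_tlb_flush()/xen_invlpg() hypercalls):

static void update_range_prot(struct vm_area_struct *vma,
                              unsigned long start, unsigned long end)
{
        /* ... rewrite the page-table entries for [start, end) ... */
        flush_tlb_range(vma, start, end);       /* full-VM flush on x86-64 */
}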
76299 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/vga.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/vga.h
76300 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/vga.h    1970-01-01 01:00:00.000000000 +0100
76301 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/vga.h       2006-04-10 00:05:52.000000000 +0200
76302 @@ -0,0 +1,20 @@
76303 +/*
76304 + *     Access to VGA videoram
76305 + *
76306 + *     (c) 1998 Martin Mares <mj@ucw.cz>
76307 + */
76308 +
76309 +#ifndef _LINUX_ASM_VGA_H_
76310 +#define _LINUX_ASM_VGA_H_
76311 +
76312 +/*
76313 + *     On the PC, we can just recalculate addresses and then
76314 + *     access the videoram directly without any black magic.
76315 + */
76316 +
76317 +#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
76318 +
76319 +#define vga_readb(x) (*(x))
76320 +#define vga_writeb(x,y) (*(y) = (x))
76321 +
76322 +#endif
76323 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/xor.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/xor.h
76324 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/asm/xor.h    1970-01-01 01:00:00.000000000 +0100
76325 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/asm/xor.h       2006-04-10 00:05:52.000000000 +0200
76326 @@ -0,0 +1,328 @@
76327 +/*
76328 + * x86-64 changes / gcc fixes from Andi Kleen. 
76329 + * Copyright 2002 Andi Kleen, SuSE Labs.
76330 + *
76331 + * This hasn't been optimized for the Hammer yet, but there is likely
76332 + * little to be gained from x86-64 here anyway.
76333 + */
76334 +
76335 +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
76336 +
76337 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to 
76338 +   tell it to do a clts before the register saving. */
76339 +#define XMMS_SAVE do {                         \
76340 +       preempt_disable();                      \
76341 +       if (!(current_thread_info()->status & TS_USEDFPU))      \
76342 +               clts();                         \
76343 +       __asm__ __volatile__ (                  \
76344 +               "movups %%xmm0,(%1)     ;\n\t"  \
76345 +               "movups %%xmm1,0x10(%1) ;\n\t"  \
76346 +               "movups %%xmm2,0x20(%1) ;\n\t"  \
76347 +               "movups %%xmm3,0x30(%1) ;\n\t"  \
76348 +               : "=&r" (cr0)                   \
76349 +               : "r" (xmm_save)                \
76350 +               : "memory");                    \
76351 +} while(0)
76352 +
76353 +#define XMMS_RESTORE do {                      \
76354 +       asm volatile (                          \
76355 +               "sfence                 ;\n\t"  \
76356 +               "movups (%1),%%xmm0     ;\n\t"  \
76357 +               "movups 0x10(%1),%%xmm1 ;\n\t"  \
76358 +               "movups 0x20(%1),%%xmm2 ;\n\t"  \
76359 +               "movups 0x30(%1),%%xmm3 ;\n\t"  \
76360 +               :                               \
76361 +               : "r" (cr0), "r" (xmm_save)     \
76362 +               : "memory");                    \
76363 +       if (!(current_thread_info()->status & TS_USEDFPU))      \
76364 +               stts();                         \
76365 +       preempt_enable();                       \
76366 +} while(0)
76367 +
76368 +#define OFFS(x)                "16*("#x")"
76369 +#define PF_OFFS(x)     "256+16*("#x")"
76370 +#define        PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
76371 +#define LD(x,y)                "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
76372 +#define ST(x,y)                "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
76373 +#define PF1(x)         "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
76374 +#define PF2(x)         "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
76375 +#define PF3(x)         "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
76376 +#define PF4(x)         "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
76377 +#define PF5(x)         "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
76378 +#define XO1(x,y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
76379 +#define XO2(x,y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
76380 +#define XO3(x,y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
76381 +#define XO4(x,y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
76382 +#define XO5(x,y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
76383 +
76384 +
76385 +static void
76386 +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
76387 +{
76388 +        unsigned int lines = bytes >> 8;
76389 +       unsigned long cr0;
76390 +       xmm_store_t xmm_save[4];
76391 +
76392 +       XMMS_SAVE;
76393 +
76394 +        asm volatile (
76395 +#undef BLOCK
76396 +#define BLOCK(i) \
76397 +               LD(i,0)                                 \
76398 +                       LD(i+1,1)                       \
76399 +               PF1(i)                                  \
76400 +                               PF1(i+2)                \
76401 +                               LD(i+2,2)               \
76402 +                                       LD(i+3,3)       \
76403 +               PF0(i+4)                                \
76404 +                               PF0(i+6)                \
76405 +               XO1(i,0)                                \
76406 +                       XO1(i+1,1)                      \
76407 +                               XO1(i+2,2)              \
76408 +                                       XO1(i+3,3)      \
76409 +               ST(i,0)                                 \
76410 +                       ST(i+1,1)                       \
76411 +                               ST(i+2,2)               \
76412 +                                       ST(i+3,3)       \
76413 +
76414 +
76415 +               PF0(0)
76416 +                               PF0(2)
76417 +
76418 +       " .align 32                     ;\n"
76419 +        " 1:                            ;\n"
76420 +
76421 +               BLOCK(0)
76422 +               BLOCK(4)
76423 +               BLOCK(8)
76424 +               BLOCK(12)
76425 +
76426 +        "       addq %[inc], %[p1]           ;\n"
76427 +        "       addq %[inc], %[p2]           ;\n"
76428 +               "               decl %[cnt] ; jnz 1b"
76429 +       : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
76430 +       : [inc] "r" (256UL) 
76431 +        : "memory");
76432 +
76433 +       XMMS_RESTORE;
76434 +}
76435 +
76436 +static void
76437 +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76438 +         unsigned long *p3)
76439 +{
76440 +       unsigned int lines = bytes >> 8;
76441 +       xmm_store_t xmm_save[4];
76442 +       unsigned long cr0;
76443 +
76444 +       XMMS_SAVE;
76445 +
76446 +        __asm__ __volatile__ (
76447 +#undef BLOCK
76448 +#define BLOCK(i) \
76449 +               PF1(i)                                  \
76450 +                               PF1(i+2)                \
76451 +               LD(i,0)                                 \
76452 +                       LD(i+1,1)                       \
76453 +                               LD(i+2,2)               \
76454 +                                       LD(i+3,3)       \
76455 +               PF2(i)                                  \
76456 +                               PF2(i+2)                \
76457 +               PF0(i+4)                                \
76458 +                               PF0(i+6)                \
76459 +               XO1(i,0)                                \
76460 +                       XO1(i+1,1)                      \
76461 +                               XO1(i+2,2)              \
76462 +                                       XO1(i+3,3)      \
76463 +               XO2(i,0)                                \
76464 +                       XO2(i+1,1)                      \
76465 +                               XO2(i+2,2)              \
76466 +                                       XO2(i+3,3)      \
76467 +               ST(i,0)                                 \
76468 +                       ST(i+1,1)                       \
76469 +                               ST(i+2,2)               \
76470 +                                       ST(i+3,3)       \
76471 +
76472 +
76473 +               PF0(0)
76474 +                               PF0(2)
76475 +
76476 +       " .align 32                     ;\n"
76477 +        " 1:                            ;\n"
76478 +
76479 +               BLOCK(0)
76480 +               BLOCK(4)
76481 +               BLOCK(8)
76482 +               BLOCK(12)
76483 +
76484 +        "       addq %[inc], %[p1]           ;\n"
76485 +        "       addq %[inc], %[p2]          ;\n"
76486 +        "       addq %[inc], %[p3]           ;\n"
76487 +               "               decl %[cnt] ; jnz 1b"
76488 +       : [cnt] "+r" (lines),
76489 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
76490 +       : [inc] "r" (256UL)
76491 +       : "memory"); 
76492 +       XMMS_RESTORE;
76493 +}
76494 +
76495 +static void
76496 +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76497 +         unsigned long *p3, unsigned long *p4)
76498 +{
76499 +       unsigned int lines = bytes >> 8;
76500 +       xmm_store_t xmm_save[4]; 
76501 +       unsigned long cr0;
76502 +
76503 +       XMMS_SAVE;
76504 +
76505 +        __asm__ __volatile__ (
76506 +#undef BLOCK
76507 +#define BLOCK(i) \
76508 +               PF1(i)                                  \
76509 +                               PF1(i+2)                \
76510 +               LD(i,0)                                 \
76511 +                       LD(i+1,1)                       \
76512 +                               LD(i+2,2)               \
76513 +                                       LD(i+3,3)       \
76514 +               PF2(i)                                  \
76515 +                               PF2(i+2)                \
76516 +               XO1(i,0)                                \
76517 +                       XO1(i+1,1)                      \
76518 +                               XO1(i+2,2)              \
76519 +                                       XO1(i+3,3)      \
76520 +               PF3(i)                                  \
76521 +                               PF3(i+2)                \
76522 +               PF0(i+4)                                \
76523 +                               PF0(i+6)                \
76524 +               XO2(i,0)                                \
76525 +                       XO2(i+1,1)                      \
76526 +                               XO2(i+2,2)              \
76527 +                                       XO2(i+3,3)      \
76528 +               XO3(i,0)                                \
76529 +                       XO3(i+1,1)                      \
76530 +                               XO3(i+2,2)              \
76531 +                                       XO3(i+3,3)      \
76532 +               ST(i,0)                                 \
76533 +                       ST(i+1,1)                       \
76534 +                               ST(i+2,2)               \
76535 +                                       ST(i+3,3)       \
76536 +
76537 +
76538 +               PF0(0)
76539 +                               PF0(2)
76540 +
76541 +       " .align 32                     ;\n"
76542 +        " 1:                            ;\n"
76543 +
76544 +               BLOCK(0)
76545 +               BLOCK(4)
76546 +               BLOCK(8)
76547 +               BLOCK(12)
76548 +
76549 +        "       addq %[inc], %[p1]           ;\n"
76550 +        "       addq %[inc], %[p2]           ;\n"
76551 +        "       addq %[inc], %[p3]           ;\n"
76552 +        "       addq %[inc], %[p4]           ;\n"
76553 +       "       decl %[cnt] ; jnz 1b"
76554 +       : [cnt] "+c" (lines),
76555 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
76556 +       : [inc] "r" (256UL)
76557 +        : "memory" );
76558 +
76559 +       XMMS_RESTORE;
76560 +}
76561 +
76562 +static void
76563 +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76564 +         unsigned long *p3, unsigned long *p4, unsigned long *p5)
76565 +{
76566 +        unsigned int lines = bytes >> 8;
76567 +       xmm_store_t xmm_save[4];
76568 +       unsigned long cr0;
76569 +
76570 +       XMMS_SAVE;
76571 +
76572 +        __asm__ __volatile__ (
76573 +#undef BLOCK
76574 +#define BLOCK(i) \
76575 +               PF1(i)                                  \
76576 +                               PF1(i+2)                \
76577 +               LD(i,0)                                 \
76578 +                       LD(i+1,1)                       \
76579 +                               LD(i+2,2)               \
76580 +                                       LD(i+3,3)       \
76581 +               PF2(i)                                  \
76582 +                               PF2(i+2)                \
76583 +               XO1(i,0)                                \
76584 +                       XO1(i+1,1)                      \
76585 +                               XO1(i+2,2)              \
76586 +                                       XO1(i+3,3)      \
76587 +               PF3(i)                                  \
76588 +                               PF3(i+2)                \
76589 +               XO2(i,0)                                \
76590 +                       XO2(i+1,1)                      \
76591 +                               XO2(i+2,2)              \
76592 +                                       XO2(i+3,3)      \
76593 +               PF4(i)                                  \
76594 +                               PF4(i+2)                \
76595 +               PF0(i+4)                                \
76596 +                               PF0(i+6)                \
76597 +               XO3(i,0)                                \
76598 +                       XO3(i+1,1)                      \
76599 +                               XO3(i+2,2)              \
76600 +                                       XO3(i+3,3)      \
76601 +               XO4(i,0)                                \
76602 +                       XO4(i+1,1)                      \
76603 +                               XO4(i+2,2)              \
76604 +                                       XO4(i+3,3)      \
76605 +               ST(i,0)                                 \
76606 +                       ST(i+1,1)                       \
76607 +                               ST(i+2,2)               \
76608 +                                       ST(i+3,3)       \
76609 +
76610 +
76611 +               PF0(0)
76612 +                               PF0(2)
76613 +
76614 +       " .align 32                     ;\n"
76615 +        " 1:                            ;\n"
76616 +
76617 +               BLOCK(0)
76618 +               BLOCK(4)
76619 +               BLOCK(8)
76620 +               BLOCK(12)
76621 +
76622 +        "       addq %[inc], %[p1]           ;\n"
76623 +        "       addq %[inc], %[p2]           ;\n"
76624 +        "       addq %[inc], %[p3]           ;\n"
76625 +        "       addq %[inc], %[p4]           ;\n"
76626 +        "       addq %[inc], %[p5]           ;\n"
76627 +       "       decl %[cnt] ; jnz 1b"
76628 +       : [cnt] "+c" (lines),
76629 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), 
76630 +         [p5] "+r" (p5)
76631 +       : [inc] "r" (256UL)
76632 +       : "memory");
76633 +
76634 +       XMMS_RESTORE;
76635 +}
76636 +
76637 +static struct xor_block_template xor_block_sse = {
76638 +        .name = "generic_sse",
76639 +        .do_2 = xor_sse_2,
76640 +        .do_3 = xor_sse_3,
76641 +        .do_4 = xor_sse_4,
76642 +        .do_5 = xor_sse_5,
76643 +};
76644 +
76645 +#undef XOR_TRY_TEMPLATES
76646 +#define XOR_TRY_TEMPLATES                              \
76647 +       do {                                            \
76648 +               xor_speed(&xor_block_sse);      \
76649 +       } while (0)
76650 +
76651 +/* We force the use of the SSE xor block because it can write around L2.
76652 +   We may also be able to load into only the L1 cache, depending on how
76653 +   the CPU deals with a load to a line that is being prefetched.  */
76654 +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
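For illustration, this is the shape of a call into the template selected above; xor_pages() is an invented wrapper, and the constraints follow from the movaps instructions and the 256-byte loop stride:

/* XOR two 16-byte-aligned buffers in place (dst ^= src).  Length must be
   a multiple of 256 bytes; whole pages satisfy both constraints. */
static void xor_pages(unsigned long *dst, unsigned long *src)
{
        xor_block_sse.do_2(PAGE_SIZE, dst, src);
}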
76655 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/irq_vectors.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/irq_vectors.h
76656 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/irq_vectors.h        1970-01-01 01:00:00.000000000 +0100
76657 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/irq_vectors.h   2006-04-10 00:05:52.000000000 +0200
76658 @@ -0,0 +1,123 @@
76659 +/*
76660 + * This file should contain #defines for all of the interrupt vector
76661 + * numbers used by this architecture.
76662 + *
76663 + * In addition, there are some standard defines:
76664 + *
76665 + *     FIRST_EXTERNAL_VECTOR:
76666 + *             The first free place for external interrupts
76667 + *
76668 + *     SYSCALL_VECTOR:
76669 + *             The IRQ vector through which a syscall makes the
76670 + *             user-to-kernel transition.
76671 + *
76672 + *     TIMER_IRQ:
76673 + *             The IRQ number the timer interrupt comes in at.
76674 + *
76675 + *     NR_IRQS:
76676 + *             The total number of interrupt vectors (including all the
76677 + *             architecture specific interrupts) needed.
76678 + *
76679 + */                    
76680 +#ifndef _ASM_IRQ_VECTORS_H
76681 +#define _ASM_IRQ_VECTORS_H
76682 +
76683 +/*
76684 + * IDT vectors usable for external interrupt sources start
76685 + * at 0x20:
76686 + */
76687 +#define FIRST_EXTERNAL_VECTOR  0x20
76688 +
76689 +#define SYSCALL_VECTOR         0x80
76690 +
76691 +/*
76692 + * Vectors 0x20-0x2f are used for ISA interrupts.
76693 + */
76694 +
76695 +#if 0
76696 +/*
76697 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
76698 + *
76699 + *  some of the following vectors are 'rare', they are merged
76700 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
76701 + *  TLB, reschedule and local APIC vectors are performance-critical.
76702 + *
76703 + *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
76704 + */
76705 +#define INVALIDATE_TLB_VECTOR  0xfd
76706 +#define RESCHEDULE_VECTOR      0xfc
76707 +#define CALL_FUNCTION_VECTOR   0xfb
76708 +
76709 +#define THERMAL_APIC_VECTOR    0xf0
76710 +/*
76711 + * Local APIC timer IRQ vector is on a different priority level,
76712 + * to work around the 'lost local interrupt if more than 2 IRQ
76713 + * sources per level' errata.
76714 + */
76715 +#define LOCAL_TIMER_VECTOR     0xef
76716 +#endif
76717 +
76718 +#define SPURIOUS_APIC_VECTOR   0xff
76719 +#define ERROR_APIC_VECTOR      0xfe
76720 +
76721 +/*
76722 + * First APIC vector available to drivers: (vectors 0x30-0xee)
76723 + * we start at 0x31 to spread out vectors evenly between priority
76724 + * levels. (0x80 is the syscall vector)
76725 + */
76726 +#define FIRST_DEVICE_VECTOR    0x31
76727 +#define FIRST_SYSTEM_VECTOR    0xef
76728 +
76729 +/*
76730 + * 16 8259A IRQ's, 208 potential APIC interrupt sources.
76731 + * Right now the APIC is mostly only used for SMP.
76732 + * 256 vectors is an architectural limit. (we can have
76733 + * more than 256 devices theoretically, but they will
76734 + * have to use shared interrupts)
76735 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
76736 + * the usable vector space is 0x20-0xff (224 vectors)
76737 + */
76738 +
76739 +#define RESCHEDULE_VECTOR      0
76740 +#define CALL_FUNCTION_VECTOR   1
76741 +#define NR_IPIS                        2
76742 +
76743 +/*
76744 + * The maximum number of vectors supported by i386 processors
76745 + * is limited to 256. For processors other than i386, NR_VECTORS
76746 + * should be changed accordingly.
76747 + */
76748 +#define NR_VECTORS 256
76749 +
76750 +#define FPU_IRQ                        13
76751 +
76752 +#define        FIRST_VM86_IRQ          3
76753 +#define LAST_VM86_IRQ          15
76754 +#define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
76755 +
76756 +/*
76757 + * The flat IRQ space is divided into two regions:
76758 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
76759 + *     if we have physical device-access privilege. This region is at the 
76760 + *     start of the IRQ space so that existing device drivers do not need
76761 + *     to be modified to translate physical IRQ numbers into our IRQ space.
76762 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
76763 + *     are bound using the provided bind/unbind functions.
76764 + */
76765 +
76766 +#define PIRQ_BASE              0
76767 +#define NR_PIRQS               256
76768 +
76769 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
76770 +#define NR_DYNIRQS             256
76771 +
76772 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
76773 +#define NR_IRQ_VECTORS         NR_IRQS
76774 +
76775 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
76776 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
76777 +
76778 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
76779 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
76780 +
76781 +#endif /* _ASM_IRQ_VECTORS_H */
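With the default PIRQ_BASE/NR_PIRQS above, the two regions sit back to back in the flat IRQ space; for example (values illustrative):

int a = pirq_to_irq(14);        /* physical IRQ 14 -> Linux IRQ 14          */
int b = dynirq_to_irq(3);       /* dynamic IRQ 3   -> Linux IRQ 259 (256+3) */
int c = irq_to_dynirq(259);     /* and back again  -> 3                     */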
76782 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/mach_time.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/mach_time.h
76783 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/mach_time.h  1970-01-01 01:00:00.000000000 +0100
76784 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/mach_time.h     2006-04-10 00:05:52.000000000 +0200
76785 @@ -0,0 +1,122 @@
76786 +/*
76787 + *  include/asm-i386/mach-default/mach_time.h
76788 + *
76789 + *  Machine specific set RTC function for generic.
76790 + *  Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
76791 + */
76792 +#ifndef _MACH_TIME_H
76793 +#define _MACH_TIME_H
76794 +
76795 +#include <asm-i386/mc146818rtc.h>
76796 +
76797 +/* timing window for calling set_rtc_mmss() around the 500 ms point */
76798 +/* used in arch/i386/time.c::do_timer_interrupt() */
76799 +#define USEC_AFTER     500000
76800 +#define USEC_BEFORE    500000
76801 +
76802 +/*
76803 + * In order to set the CMOS clock precisely, set_rtc_mmss has to be
76804 + * called 500 ms after the second nowtime has started, because when
76805 + * nowtime is written into the registers of the CMOS clock, it will
76806 + * jump to the next second precisely 500 ms later. Check the Motorola
76807 + * MC146818A or Dallas DS12887 data sheet for details.
76808 + *
76809 + * BUG: This routine does not handle hour overflow properly; it just
76810 + *      sets the minutes. Usually you'll only notice that after reboot!
76811 + */
76812 +static inline int mach_set_rtc_mmss(unsigned long nowtime)
76813 +{
76814 +       int retval = 0;
76815 +       int real_seconds, real_minutes, cmos_minutes;
76816 +       unsigned char save_control, save_freq_select;
76817 +
76818 +       save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
76819 +       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
76820 +
76821 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
76822 +       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
76823 +
76824 +       cmos_minutes = CMOS_READ(RTC_MINUTES);
76825 +       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
76826 +               BCD_TO_BIN(cmos_minutes);
76827 +
76828 +       /*
76829 +        * since we're only adjusting minutes and seconds,
76830 +        * don't interfere with hour overflow. This avoids
76831 +        * messing with unknown time zones but requires your
76832 +        * RTC not to be off by more than 15 minutes
76833 +        */
76834 +       real_seconds = nowtime % 60;
76835 +       real_minutes = nowtime / 60;
76836 +       if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
76837 +               real_minutes += 30;             /* correct for half hour time zone */
76838 +       real_minutes %= 60;
76839 +
76840 +       if (abs(real_minutes - cmos_minutes) < 30) {
76841 +               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
76842 +                       BIN_TO_BCD(real_seconds);
76843 +                       BIN_TO_BCD(real_minutes);
76844 +               }
76845 +               CMOS_WRITE(real_seconds,RTC_SECONDS);
76846 +               CMOS_WRITE(real_minutes,RTC_MINUTES);
76847 +       } else {
76848 +               printk(KERN_WARNING
76849 +                      "set_rtc_mmss: can't update from %d to %d\n",
76850 +                      cmos_minutes, real_minutes);
76851 +               retval = -1;
76852 +       }
76853 +
76854 +       /* The following flags have to be released exactly in this order,
76855 +        * otherwise the DS12887 (popular MC146818A clone with integrated
76856 +        * battery and quartz) will not reset the oscillator and will not
76857 +        * update precisely 500 ms later. You won't find this mentioned in
76858 +        * the Dallas Semiconductor data sheets, but who believes data
76859 +        * sheets anyway ...                           -- Markus Kuhn
76860 +        */
76861 +       CMOS_WRITE(save_control, RTC_CONTROL);
76862 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
76863 +
76864 +       return retval;
76865 +}
76866 +
76867 +static inline unsigned long mach_get_cmos_time(void)
76868 +{
76869 +       unsigned int year, mon, day, hour, min, sec;
76870 +       int i;
76871 +
76872 +       /* The Linux interpretation of the CMOS clock register contents:
76873 +        * When the Update-In-Progress (UIP) flag goes from 1 to 0, the
76874 +        * RTC registers show the second which has precisely just started.
76875 +        * Let's hope other operating systems interpret the RTC the same way.
76876 +        */
76877 +       /* read RTC exactly on falling edge of update flag */
76878 +       for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */
76879 +               if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)
76880 +                       break;
76881 +       for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */
76882 +               if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
76883 +                       break;
76884 +       do { /* Isn't this overkill? The UIP check above should guarantee consistency. */
76885 +               sec = CMOS_READ(RTC_SECONDS);
76886 +               min = CMOS_READ(RTC_MINUTES);
76887 +               hour = CMOS_READ(RTC_HOURS);
76888 +               day = CMOS_READ(RTC_DAY_OF_MONTH);
76889 +               mon = CMOS_READ(RTC_MONTH);
76890 +               year = CMOS_READ(RTC_YEAR);
76891 +       } while (sec != CMOS_READ(RTC_SECONDS));
76892 +       if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
76893 +               BCD_TO_BIN(sec);
76894 +               BCD_TO_BIN(min);
76895 +               BCD_TO_BIN(hour);
76896 +               BCD_TO_BIN(day);
76897 +               BCD_TO_BIN(mon);
76898 +               BCD_TO_BIN(year);
76899 +       }
76901 +       if ((year += 1900) < 1970)
76902 +               year += 100;
76903 +
76904 +       return mktime(year, mon, day, hour, min, sec);
76905 +}
76906 +
76907 +#endif /* !_MACH_TIME_H */
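A sketch of the timing gate on the caller's side, loosely following arch/i386/kernel/time.c::do_timer_interrupt() (xtime and last_rtc_update are assumed from that context):

/* Only write the CMOS clock inside the window around the half-second
   point described by USEC_AFTER/USEC_BEFORE, at most once per ~11 min. */
if (xtime.tv_sec > last_rtc_update + 660 &&
    (xtime.tv_nsec / 1000) >= USEC_AFTER  - ((unsigned) TICK_SIZE) / 2 &&
    (xtime.tv_nsec / 1000) <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) {
        if (mach_set_rtc_mmss(xtime.tv_sec) == 0)
                last_rtc_update = xtime.tv_sec;
        else
                last_rtc_update = xtime.tv_sec - 600;   /* retry in ~60 s */
}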
76908 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/mach_timer.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/mach_timer.h
76909 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/mach_timer.h 1970-01-01 01:00:00.000000000 +0100
76910 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/mach_timer.h    2006-04-10 00:05:52.000000000 +0200
76911 @@ -0,0 +1,48 @@
76912 +/*
76913 + *  include/asm-i386/mach-default/mach_timer.h
76914 + *
76915 + *  Machine specific calibrate_tsc() for generic.
76916 + *  Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
76917 + */
76918 +/* ------ Calibrate the TSC ------- 
76919 + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
76920 + * Too much 64-bit arithmetic here to do this cleanly in C, and for
76921 + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
76922 + * output busy loop as low as possible. We avoid reading the CTC registers
76923 + * directly because of the awkward 8-bit access mechanism of the 82C54
76924 + * device.
76925 + */
76926 +#ifndef _MACH_TIMER_H
76927 +#define _MACH_TIMER_H
76928 +
76929 +#define CALIBRATE_LATCH        (5 * LATCH)
76930 +
76931 +static inline void mach_prepare_counter(void)
76932 +{
76933 +       /* Set the Gate high, disable speaker */
76934 +       outb((inb(0x61) & ~0x02) | 0x01, 0x61);
76935 +
76936 +       /*
76937 +        * Now let's take care of CTC channel 2
76938 +        *
76939 +        * Set the Gate high, program CTC channel 2 for mode 0,
76940 +        * (interrupt on terminal count mode), binary count,
76941 +        * load 5 * LATCH count, (LSB and MSB) to begin countdown.
76942 +        *
76943 +        * Some devices need a delay here.
76944 +        */
76945 +       outb(0xb0, 0x43);                       /* binary, mode 0, LSB/MSB, Ch 2 */
76946 +       outb_p(CALIBRATE_LATCH & 0xff, 0x42);   /* LSB of count */
76947 +       outb_p(CALIBRATE_LATCH >> 8, 0x42);       /* MSB of count */
76948 +}
76949 +
76950 +static inline void mach_countup(unsigned long *count_p)
76951 +{
76952 +       unsigned long count = 0;
76953 +       do {
76954 +               count++;
76955 +       } while ((inb_p(0x61) & 0x20) == 0);
76956 +       *count_p = count;
76957 +}
76958 +
76959 +#endif /* !_MACH_TIMER_H */
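A rough sketch of the calibration loop these two helpers support (the real calibrate_tsc() adds overflow and sanity checking; rough_tsc_khz() is invented):

#include <asm/msr.h>            /* rdtscll() */
#include <asm/div64.h>          /* do_div()  */
#include <asm/timex.h>          /* CLOCK_TICK_RATE: the 1193182 Hz PIT clock */

static unsigned long __init rough_tsc_khz(void)
{
        unsigned long long start, end, cycles;
        unsigned long count, per_tick;

        mach_prepare_counter();
        rdtscll(start);
        mach_countup(&count);           /* spins until terminal count */
        rdtscll(end);

        cycles = end - start;
        do_div(cycles, CALIBRATE_LATCH);        /* TSC cycles per PIT tick */
        per_tick = (unsigned long)cycles;
        return per_tick * (CLOCK_TICK_RATE / 1000);     /* ~kHz */
}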
76960 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/setup_arch_post.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/setup_arch_post.h
76961 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/setup_arch_post.h    1970-01-01 01:00:00.000000000 +0100
76962 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/setup_arch_post.h       2006-04-10 00:05:52.000000000 +0200
76963 @@ -0,0 +1,28 @@
76964 +/**
76965 + * machine_specific_* - Hooks for machine specific setup.
76966 + *
76967 + * Description:
76968 + *     This is included late in kernel/setup.c so that it can make
76969 + *     use of all of the static functions.
76970 + **/
76971 +
76972 +extern void hypervisor_callback(void);
76973 +extern void failsafe_callback(void);
76974 +extern void nmi(void);
76975 +
76976 +static void __init machine_specific_arch_setup(void)
76977 +{
76978 +#ifdef CONFIG_X86_LOCAL_APIC
76979 +       struct xennmi_callback cb;
76980 +#endif
76981 +
76982 +       HYPERVISOR_set_callbacks(
76983 +                (unsigned long) hypervisor_callback,
76984 +                (unsigned long) failsafe_callback,
76985 +                (unsigned long) system_call);
76986 +
76987 +#ifdef CONFIG_X86_LOCAL_APIC
76988 +       cb.handler_address = (unsigned long)&nmi;
76989 +       HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
76990 +#endif
76991 +}
76992 diff -Nurp ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/setup_arch_pre.h tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/setup_arch_pre.h
76993 --- ref-linux-2.6.16.9/include/asm-x86_64/mach-xen/setup_arch_pre.h     1970-01-01 01:00:00.000000000 +0100
76994 +++ tmp-linux-2.6-xen.patch/include/asm-x86_64/mach-xen/setup_arch_pre.h        2006-04-10 00:05:52.000000000 +0200
76995 @@ -0,0 +1,5 @@
76996 +/* Hook to call BIOS initialisation function */
76997 +
76998 +#define ARCH_SETUP machine_specific_arch_setup();
76999 +
77000 +static void __init machine_specific_arch_setup(void);
77001 diff -Nurp ref-linux-2.6.16.9/include/linux/gfp.h tmp-linux-2.6-xen.patch/include/linux/gfp.h
77002 --- ref-linux-2.6.16.9/include/linux/gfp.h      2006-04-19 08:10:14.000000000 +0200
77003 +++ tmp-linux-2.6-xen.patch/include/linux/gfp.h 2006-04-10 00:05:52.000000000 +0200
77004 @@ -98,7 +98,11 @@ static inline int gfp_zone(gfp_t gfp)
77005   */
77006  
77007  #ifndef HAVE_ARCH_FREE_PAGE
77008 -static inline void arch_free_page(struct page *page, int order) { }
77009 +/*
77010 + * If arch_free_page returns non-zero then the generic free_page code can
77011 + * immediately bail: the arch-specific function has done all the work.
77012 + */
77013 +static inline int arch_free_page(struct page *page, int order) { return 0; }
77014  #endif
77015  
77016  extern struct page *
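The new comment describes a convention the caller must honor; the call site's shape, sketched (the real one is in mm/page_alloc.c, and example_free_path() is invented):

static void example_free_path(struct page *page, int order)
{
        if (arch_free_page(page, order))
                return;         /* arch consumed the page (e.g. ballooned out) */
        /* ... normal buddy-allocator freeing continues here ... */
}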
77017 diff -Nurp ref-linux-2.6.16.9/include/linux/highmem.h tmp-linux-2.6-xen.patch/include/linux/highmem.h
77018 --- ref-linux-2.6.16.9/include/linux/highmem.h  2006-04-19 08:10:14.000000000 +0200
77019 +++ tmp-linux-2.6-xen.patch/include/linux/highmem.h     2006-04-10 00:05:52.000000000 +0200
77020 @@ -13,10 +13,16 @@
77021  
77022  /* declarations for linux/mm/highmem.c */
77023  unsigned int nr_free_highpages(void);
77024 +#ifdef CONFIG_XEN
77025 +void kmap_flush_unused(void);
77026 +#endif
77027  
77028  #else /* CONFIG_HIGHMEM */
77029  
77030  static inline unsigned int nr_free_highpages(void) { return 0; }
77031 +#ifdef CONFIG_XEN
77032 +static inline void kmap_flush_unused(void) { }
77033 +#endif
77034  
77035  static inline void *kmap(struct page *page)
77036  {
77037 diff -Nurp ref-linux-2.6.16.9/include/linux/mm.h tmp-linux-2.6-xen.patch/include/linux/mm.h
77038 --- ref-linux-2.6.16.9/include/linux/mm.h       2006-04-19 08:10:14.000000000 +0200
77039 +++ tmp-linux-2.6-xen.patch/include/linux/mm.h  2006-04-10 00:05:52.000000000 +0200
77040 @@ -166,6 +166,9 @@ extern unsigned int kobjsize(const void 
77041  #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
77042  #define VM_MAPPED_COPY 0x01000000      /* T if mapped copy of data (nommu mmap) */
77043  #define VM_INSERTPAGE  0x02000000      /* The vma has had "vm_insert_page()" done on it */
77044 +#ifdef CONFIG_XEN
77045 +#define VM_FOREIGN     0x04000000      /* Has pages belonging to another VM */
77046 +#endif
77047  
77048  #ifndef VM_STACK_DEFAULT_FLAGS         /* arch can override this */
77049  #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
77050 @@ -229,9 +232,10 @@ struct page {
77051                 unsigned long private;          /* Mapping-private opaque data:
77052                                                  * usually used for buffer_heads
77053                                                  * if PagePrivate set; used for
77054 -                                                * swp_entry_t if PageSwapCache;
77055 +                                                * swp_entry_t if PageSwapCache.
77056 +                                                * When page is free, this
77057                                                  * indicates order in the buddy
77058 -                                                * system if PG_buddy is set.
77059 +                                                * system.
77060                                                  */
77061                 struct address_space *mapping;  /* If low bit clear, points to
77062                                                  * inode address_space, or NULL.
77063 @@ -244,6 +248,9 @@ struct page {
77064  #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
77065             spinlock_t ptl;
77066  #endif
77067 +#ifdef CONFIG_XEN
77068 +           struct list_head ballooned;
77069 +#endif
77070         };
77071         pgoff_t index;                  /* Our offset within mapping. */
77072         struct list_head lru;           /* Pageout list, eg. active_list
77073 @@ -1012,6 +1019,13 @@ struct page *follow_page(struct vm_area_
77074  #define FOLL_GET       0x04    /* do get_page on page */
77075  #define FOLL_ANON      0x08    /* give ZERO_PAGE if no pgtable */
77076  
77077 +#ifdef CONFIG_XEN
77078 +typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
77079 +                       void *data);
77080 +extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
77081 +                              unsigned long size, pte_fn_t fn, void *data);
77082 +#endif
77083 +
77084  #ifdef CONFIG_PROC_FS
77085  void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
77086  #else
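A sketch of a pte_fn_t callback for the new apply_to_page_range() hook (clear_one_pte() is invented; the walker calls fn once per PTE and aborts if it returns non-zero):

static int clear_one_pte(pte_t *pte, struct page *pmd_page,
                         unsigned long addr, void *data)
{
        pte_clear(&init_mm, addr, pte);
        return 0;
}

/* e.g.: apply_to_page_range(&init_mm, vaddr, nr * PAGE_SIZE,
                             clear_one_pte, NULL); */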
77087 diff -Nurp ref-linux-2.6.16.9/include/linux/skbuff.h tmp-linux-2.6-xen.patch/include/linux/skbuff.h
77088 --- ref-linux-2.6.16.9/include/linux/skbuff.h   2006-04-19 08:10:14.000000000 +0200
77089 +++ tmp-linux-2.6-xen.patch/include/linux/skbuff.h      2006-04-10 00:05:52.000000000 +0200
77090 @@ -189,6 +189,8 @@ enum {
77091   *     @local_df: allow local fragmentation
77092   *     @cloned: Head may be cloned (check refcnt to be sure)
77093   *     @nohdr: Payload reference only, must not modify header
77094 + *     @proto_data_valid: Protocol data validated since arriving at localhost
77095 + *     @proto_csum_blank: Protocol csum must be added before leaving localhost
77096   *     @pkt_type: Packet class
77097   *     @fclone: skbuff clone status
77098   *     @ip_summed: Driver fed us an IP checksum
77099 @@ -265,7 +267,13 @@ struct sk_buff {
77100                                 nfctinfo:3;
77101         __u8                    pkt_type:3,
77102                                 fclone:2,
77103 +#ifndef CONFIG_XEN
77104                                 ipvs_property:1;
77105 +#else
77106 +                               ipvs_property:1,
77107 +                               proto_data_valid:1,
77108 +                               proto_csum_blank:1;
77109 +#endif
77110         __be16                  protocol;
77111  
77112         void                    (*destructor)(struct sk_buff *skb);
77113 @@ -321,7 +329,8 @@ static inline struct sk_buff *alloc_skb_
77114  
77115  extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
77116                                             unsigned int size,
77117 -                                           gfp_t priority);
77118 +                                           gfp_t priority,
77119 +                                           int fclone);
77120  extern void           kfree_skbmem(struct sk_buff *skb);
77121  extern struct sk_buff *skb_clone(struct sk_buff *skb,
77122                                  gfp_t priority);
77123 @@ -1051,7 +1060,7 @@ static inline struct sk_buff *__dev_allo
77124         return skb;
77125  }
77126  #else
77127 -extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask);
77128 +extern struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask);
77129  #endif
77130  
77131  /**
77132 diff -Nurp ref-linux-2.6.16.9/include/xen/balloon.h tmp-linux-2.6-xen.patch/include/xen/balloon.h
77133 --- ref-linux-2.6.16.9/include/xen/balloon.h    1970-01-01 01:00:00.000000000 +0100
77134 +++ tmp-linux-2.6-xen.patch/include/xen/balloon.h       2006-04-10 00:05:52.000000000 +0200
77135 @@ -0,0 +1,73 @@
77136 +/******************************************************************************
77137 + * balloon.h
77138 + *
77139 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
77140 + *
77141 + * Copyright (c) 2003, B Dragovic
77142 + * Copyright (c) 2003-2004, M Williamson, K Fraser
77143 + * 
77144 + * This program is free software; you can redistribute it and/or
77145 + * modify it under the terms of the GNU General Public License version 2
77146 + * as published by the Free Software Foundation; or, when distributed
77147 + * separately from the Linux kernel or incorporated into other
77148 + * software packages, subject to the following license:
77149 + * 
77150 + * Permission is hereby granted, free of charge, to any person obtaining a copy
77151 + * of this source file (the "Software"), to deal in the Software without
77152 + * restriction, including without limitation the rights to use, copy, modify,
77153 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
77154 + * and to permit persons to whom the Software is furnished to do so, subject to
77155 + * the following conditions:
77156 + * 
77157 + * The above copyright notice and this permission notice shall be included in
77158 + * all copies or substantial portions of the Software.
77159 + * 
77160 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
77161 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
77162 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
77163 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
77164 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
77165 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
77166 + * IN THE SOFTWARE.
77167 + */
77168 +
77169 +#ifndef __ASM_BALLOON_H__
77170 +#define __ASM_BALLOON_H__
77171 +
77172 +/*
77173 + * Inform the balloon driver that it should allow some slop for device-driver
77174 + * memory activities.
77175 + */
77176 +extern void
77177 +balloon_update_driver_allowance(
77178 +       long delta);
77179 +
77180 +/* Allocate an empty low-memory page range. */
77181 +extern struct page *
77182 +balloon_alloc_empty_page_range(
77183 +       unsigned long nr_pages);
77184 +
77185 +/* Deallocate an empty page range, adding to the balloon. */
77186 +extern void
77187 +balloon_dealloc_empty_page_range(
77188 +       struct page *page, unsigned long nr_pages);
77189 +
77190 +/*
77191 + * Prevent the balloon driver from changing the memory reservation during
77192 + * a driver critical region.
77193 + */
77194 +extern spinlock_t balloon_lock;
77195 +#define balloon_lock(__flags)   spin_lock_irqsave(&balloon_lock, __flags)
77196 +#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
77197 +
77198 +#endif /* __ASM_BALLOON_H__ */
77199 +
77200 +/*
77201 + * Local variables:
77202 + *  c-file-style: "linux"
77203 + *  indent-tabs-mode: t
77204 + *  c-indent-level: 8
77205 + *  c-basic-offset: 8
77206 + *  tab-width: 8
77207 + * End:
77208 + */
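Typical use of the lock macros above, sketched (the critical-section body is illustrative):

unsigned long flags;

balloon_lock(flags);
/* ... exchange machine frames while the reservation cannot change ... */
balloon_unlock(flags);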
77209 diff -Nurp ref-linux-2.6.16.9/include/xen/driver_util.h tmp-linux-2.6-xen.patch/include/xen/driver_util.h
77210 --- ref-linux-2.6.16.9/include/xen/driver_util.h        1970-01-01 01:00:00.000000000 +0100
77211 +++ tmp-linux-2.6-xen.patch/include/xen/driver_util.h   2006-04-10 00:05:52.000000000 +0200
77212 @@ -0,0 +1,26 @@
77213 +
77214 +#ifndef __ASM_XEN_DRIVER_UTIL_H__
77215 +#define __ASM_XEN_DRIVER_UTIL_H__
77216 +
77217 +#include <linux/config.h>
77218 +#include <linux/vmalloc.h>
77219 +
77220 +/* Allocate/destroy a 'vmalloc' VM area. */
77221 +extern struct vm_struct *alloc_vm_area(unsigned long size);
77222 +extern void free_vm_area(struct vm_struct *area);
77223 +
77224 +/* Lock an area so that PTEs are accessible in the current address space. */
77225 +extern void lock_vm_area(struct vm_struct *area);
77226 +extern void unlock_vm_area(struct vm_struct *area);
77227 +
77228 +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
77229 +
77230 +/*
77231 + * Local variables:
77232 + *  c-file-style: "linux"
77233 + *  indent-tabs-mode: t
77234 + *  c-indent-level: 8
77235 + *  c-basic-offset: 8
77236 + *  tab-width: 8
77237 + * End:
77238 + */
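A sketch of the intended usage (example_map_ring() and its body are invented; the pattern mirrors how the split drivers map shared rings):

static int example_map_ring(void)
{
        struct vm_struct *area = alloc_vm_area(PAGE_SIZE);

        if (area == NULL)
                return -ENOMEM;
        /* ... point area->addr at a foreign page, e.g. via grant-table
           hypercalls, and use it ... */
        free_vm_area(area);
        return 0;
}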
77239 diff -Nurp ref-linux-2.6.16.9/include/xen/evtchn.h tmp-linux-2.6-xen.patch/include/xen/evtchn.h
77240 --- ref-linux-2.6.16.9/include/xen/evtchn.h     1970-01-01 01:00:00.000000000 +0100
77241 +++ tmp-linux-2.6-xen.patch/include/xen/evtchn.h        2006-04-10 00:05:52.000000000 +0200
77242 @@ -0,0 +1,126 @@
77243 +/******************************************************************************
77244 + * evtchn.h
77245 + * 
77246 + * Communication via Xen event channels.
77247 + * Also definitions for the device that demuxes notifications to userspace.
77248 + * 
77249 + * Copyright (c) 2004-2005, K A Fraser
77250 + * 
77251 + * This program is free software; you can redistribute it and/or
77252 + * modify it under the terms of the GNU General Public License version 2
77253 + * as published by the Free Software Foundation; or, when distributed
77254 + * separately from the Linux kernel or incorporated into other
77255 + * software packages, subject to the following license:
77256 + * 
77257 + * Permission is hereby granted, free of charge, to any person obtaining a copy
77258 + * of this source file (the "Software"), to deal in the Software without
77259 + * restriction, including without limitation the rights to use, copy, modify,
77260 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
77261 + * and to permit persons to whom the Software is furnished to do so, subject to
77262 + * the following conditions:
77263 + * 
77264 + * The above copyright notice and this permission notice shall be included in
77265 + * all copies or substantial portions of the Software.
77266 + * 
77267 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
77268 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
77269 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
77270 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
77271 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
77272 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
77273 + * IN THE SOFTWARE.
77274 + */
77275 +
77276 +#ifndef __ASM_EVTCHN_H__
77277 +#define __ASM_EVTCHN_H__
77278 +
77279 +#include <linux/config.h>
77280 +#include <linux/interrupt.h>
77281 +#include <asm/hypervisor.h>
77282 +#include <asm/ptrace.h>
77283 +#include <asm/synch_bitops.h>
77284 +#include <xen/interface/event_channel.h>
77285 +#include <linux/smp.h>
77286 +
77287 +/*
77288 + * LOW-LEVEL DEFINITIONS
77289 + */
77290 +
77291 +/*
77292 + * Dynamically bind an event source to an IRQ-like callback handler.
77293 + * On some platforms this may not be implemented via the Linux IRQ subsystem.
77294 + * The IRQ argument passed to the callback handler is the same as returned
77295 + * from the bind call. It may not correspond to a Linux IRQ number.
77296 + * Returns IRQ or negative errno.
77297 + * UNBIND: Takes IRQ to unbind from; automatically closes the event channel.
77298 + */
77299 +extern int bind_evtchn_to_irqhandler(
77300 +       unsigned int evtchn,
77301 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
77302 +       unsigned long irqflags,
77303 +       const char *devname,
77304 +       void *dev_id);
77305 +extern int bind_virq_to_irqhandler(
77306 +       unsigned int virq,
77307 +       unsigned int cpu,
77308 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
77309 +       unsigned long irqflags,
77310 +       const char *devname,
77311 +       void *dev_id);
77312 +extern int bind_ipi_to_irqhandler(
77313 +       unsigned int ipi,
77314 +       unsigned int cpu,
77315 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
77316 +       unsigned long irqflags,
77317 +       const char *devname,
77318 +       void *dev_id);
77319 +
77320 +/*
77321 + * Common unbind function for all event sources. Takes IRQ to unbind from.
77322 + * Automatically closes the underlying event channel (even for bindings
77323 + * made with bind_evtchn_to_irqhandler()).
77324 + */
77325 +extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);
77326 +
77327 +extern void irq_resume(void);
77328 +
77329 +/* Entry point for notifications into Linux subsystems. */
77330 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
77331 +
77332 +/* Entry point for notifications into the userland character device. */
77333 +extern void evtchn_device_upcall(int port);
77334 +
77335 +extern void mask_evtchn(int port);
77336 +extern void unmask_evtchn(int port);
77337 +
77338 +static inline void clear_evtchn(int port)
77339 +{
77340 +       shared_info_t *s = HYPERVISOR_shared_info;
77341 +       synch_clear_bit(port, &s->evtchn_pending[0]);
77342 +}
77343 +
77344 +static inline void notify_remote_via_evtchn(int port)
77345 +{
77346 +       evtchn_op_t op;
77347 +       op.cmd         = EVTCHNOP_send;
77348 +       op.u.send.port = port;
77349 +       (void)HYPERVISOR_event_channel_op(&op);
77350 +}
77351 +
77352 +/*
77353 + * Unlike notify_remote_via_evtchn(), this is safe to use across
77354 + * save/restore. Notifications on a broken connection are silently dropped.
77355 + */
77356 +extern void notify_remote_via_irq(int irq);
77357 +
77358 +#endif /* __ASM_EVTCHN_H__ */
77359 +
77360 +/*
77361 + * Local variables:
77362 + *  c-file-style: "linux"
77363 + *  indent-tabs-mode: t
77364 + *  c-indent-level: 8
77365 + *  c-basic-offset: 8
77366 + *  tab-width: 8
77367 + * End:
77368 + */
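For illustration, binding the timer VIRQ on CPU 0 (VIRQ_TIMER comes from the Xen interface headers; the handler body and all "my_"/"example_" names are invented):

static irqreturn_t my_timer_interrupt(int irq, void *dev_id,
                                      struct pt_regs *regs)
{
        /* ... per-tick work ... */
        return IRQ_HANDLED;
}

static int __init example_bind_timer(void)
{
        int irq = bind_virq_to_irqhandler(VIRQ_TIMER, 0, my_timer_interrupt,
                                          SA_INTERRUPT, "timer", NULL);
        if (irq < 0)
                return irq;     /* negative errno */
        /* ... later: unbind_from_irqhandler(irq, NULL); */
        return 0;
}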
77369 diff -Nurp ref-linux-2.6.16.9/include/xen/features.h tmp-linux-2.6-xen.patch/include/xen/features.h
77370 --- ref-linux-2.6.16.9/include/xen/features.h   1970-01-01 01:00:00.000000000 +0100
77371 +++ tmp-linux-2.6-xen.patch/include/xen/features.h      2006-04-10 00:05:52.000000000 +0200
77372 @@ -0,0 +1,20 @@
77373 +/******************************************************************************
77374 + * features.h
77375 + *
77376 + * Query the features reported by Xen.
77377 + *
77378 + * Copyright (c) 2006, Ian Campbell
77379 + */
77380 +
77381 +#ifndef __ASM_XEN_FEATURES_H__
77382 +#define __ASM_XEN_FEATURES_H__
77383 +
77384 +#include <xen/interface/version.h>
77385 +
77386 +extern void setup_xen_features(void);
77387 +
77388 +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
77389 +
77390 +#define xen_feature(flag)      (xen_features[flag])
77391 +
77392 +#endif /* __ASM_XEN_FEATURES_H__ */
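A sketch of a run-time check (XENFEAT_auto_translated_physmap is a flag from the interface headers; the set_phys_to_machine() caller is assumed for illustration):

if (!xen_feature(XENFEAT_auto_translated_physmap))
        set_phys_to_machine(pfn, mfn);  /* we maintain the p2m mapping ourselves */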
77393 diff -Nurp ref-linux-2.6.16.9/include/xen/foreign_page.h tmp-linux-2.6-xen.patch/include/xen/foreign_page.h
77394 --- ref-linux-2.6.16.9/include/xen/foreign_page.h       1970-01-01 01:00:00.000000000 +0100
77395 +++ tmp-linux-2.6-xen.patch/include/xen/foreign_page.h  2006-04-10 00:05:52.000000000 +0200
77396 @@ -0,0 +1,40 @@
77397 +/******************************************************************************
77398 + * foreign_page.h
77399 + * 
77400 + * Provide a "foreign" page type, that is owned by a foreign allocator and 
77401 + * not the normal buddy allocator in page_alloc.c
77402 + * 
77403 + * Copyright (c) 2004, K A Fraser
77404 + */
77405 +
77406 +#ifndef __ASM_XEN_FOREIGN_PAGE_H__
77407 +#define __ASM_XEN_FOREIGN_PAGE_H__
77408 +
77409 +#define PG_foreign             PG_arch_1
77410 +
77411 +#define PageForeign(page)      test_bit(PG_foreign, &(page)->flags)
77412 +
77413 +#define SetPageForeign(page, dtor) do {                \
77414 +       set_bit(PG_foreign, &(page)->flags);    \
77415 +       (page)->mapping = (void *)dtor;         \
77416 +} while (0)
77417 +
77418 +#define ClearPageForeign(page) do {            \
77419 +       clear_bit(PG_foreign, &(page)->flags);  \
77420 +       (page)->mapping = NULL;                 \
77421 +} while (0)
77422 +
77423 +#define PageForeignDestructor(page)    \
77424 +       ( (void (*) (struct page *)) (page)->mapping )
77425 +
77426 +#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */
77427 +
77428 +/*
77429 + * Local variables:
77430 + *  c-file-style: "linux"
77431 + *  indent-tabs-mode: t
77432 + *  c-indent-level: 8
77433 + *  c-basic-offset: 8
77434 + *  tab-width: 8
77435 + * End:
77436 + */
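The lifecycle these macros support, sketched (my_page_release() is invented; compare the arch_free_page() hook added to linux/gfp.h in this patch):

static void my_page_release(struct page *page)
{
        ClearPageForeign(page);
        /* ... hand the page back to the owning driver's pool ... */
}

/* at hand-out time: */
SetPageForeign(page, my_page_release);

/* in the free path, instead of the buddy allocator: */
if (PageForeign(page))
        (*PageForeignDestructor(page))(page);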
77437 diff -Nurp ref-linux-2.6.16.9/include/xen/gnttab.h tmp-linux-2.6-xen.patch/include/xen/gnttab.h
77438 --- ref-linux-2.6.16.9/include/xen/gnttab.h     1970-01-01 01:00:00.000000000 +0100
77439 +++ tmp-linux-2.6-xen.patch/include/xen/gnttab.h        2006-04-10 00:05:52.000000000 +0200
77440 @@ -0,0 +1,126 @@
77441 +/******************************************************************************
77442 + * gnttab.h
77443 + * 
77444 + * Two sets of functionality:
77445 + * 1. Granting foreign access to our memory reservation.
77446 + * 2. Accessing others' memory reservations via grant references.
77447 + * (i.e., mechanisms for both sender and recipient of grant references)
77448 + * 
77449 + * Copyright (c) 2004-2005, K A Fraser
77450 + * Copyright (c) 2005, Christopher Clark
77451 + * 
77452 + * This program is free software; you can redistribute it and/or
77453 + * modify it under the terms of the GNU General Public License version 2
77454 + * as published by the Free Software Foundation; or, when distributed
77455 + * separately from the Linux kernel or incorporated into other
77456 + * software packages, subject to the following license:
77457 + * 
77458 + * Permission is hereby granted, free of charge, to any person obtaining a copy
77459 + * of this source file (the "Software"), to deal in the Software without
77460 + * restriction, including without limitation the rights to use, copy, modify,
77461 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
77462 + * and to permit persons to whom the Software is furnished to do so, subject to
77463 + * the following conditions:
77464 + * 
77465 + * The above copyright notice and this permission notice shall be included in
77466 + * all copies or substantial portions of the Software.
77467 + * 
77468 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
77469 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
77470 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
77471 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
77472 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
77473 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
77474 + * IN THE SOFTWARE.
77475 + */
77476 +
77477 +#ifndef __ASM_GNTTAB_H__
77478 +#define __ASM_GNTTAB_H__
77479 +
77480 +#include <linux/config.h>
77481 +#include <asm/hypervisor.h>
77482 +#include <xen/interface/grant_table.h>
77483 +
77484 +/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
77485 +#ifdef __ia64__
77486 +#define NR_GRANT_FRAMES 1
77487 +#else
77488 +#define NR_GRANT_FRAMES 4
77489 +#endif
77490 +
77491 +struct gnttab_free_callback {
77492 +       struct gnttab_free_callback *next;
77493 +       void (*fn)(void *);
77494 +       void *arg;
77495 +       u16 count;
77496 +};
77497 +
77498 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
77499 +                               int readonly);
77500 +
77501 +/*
77502 + * End access through the given grant reference, iff the grant entry is no
77503 + * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
77504 + * use.
77505 + */
77506 +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
77507 +
77508 +/*
77509 + * Eventually end access through the given grant reference, and once that
77510 + * access has been ended, free the given page too.  Access will be ended
77511 + * immediately iff the grant entry is not in use, otherwise it will happen
77512 + * some time later.  page may be 0, in which case no freeing will occur.
77513 + */
77514 +void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
77515 +                              unsigned long page);
77516 +
77517 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
77518 +
77519 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
77520 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
77521 +
77522 +int gnttab_query_foreign_access(grant_ref_t ref);
77523 +
77524 +/*
77525 + * operations on reserved batches of grant references
77526 + */
77527 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
77528 +
77529 +void gnttab_free_grant_reference(grant_ref_t ref);
77530 +
77531 +void gnttab_free_grant_references(grant_ref_t head);
77532 +
77533 +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
77534 +
77535 +void gnttab_release_grant_reference(grant_ref_t *private_head,
77536 +                                   grant_ref_t release);
77537 +
77538 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
77539 +                                 void (*fn)(void *), void *arg, u16 count);
77540 +
77541 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
77542 +                                    unsigned long frame, int readonly);
77543 +
77544 +void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
77545 +                                      unsigned long pfn);
77546 +
77547 +#ifdef __ia64__
77548 +#define gnttab_map_vaddr(map) __va(map.dev_bus_addr)
77549 +#else
77550 +#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
77551 +#endif
77552 +
77553 +int gnttab_suspend(void);
77554 +int gnttab_resume(void);
77555 +
77556 +#endif /* __ASM_GNTTAB_H__ */
77557 +
77558 +/*
77559 + * Local variables:
77560 + *  c-file-style: "linux"
77561 + *  indent-tabs-mode: t
77562 + *  c-indent-level: 8
77563 + *  c-basic-offset: 8
77564 + *  tab-width: 8
77565 + * End:
77566 + */
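For orientation, a minimal sketch of how a split-driver backend might use the grant-table API declared above: grant a frame to a peer domain, then revoke the grant and free the page. The domid, the page allocation, and virt_to_mfn() (a helper from the Xen asm headers) are assumptions of the sketch, not part of this header.

    /* Hypothetical usage sketch of the gnttab API declared above. */
    static int share_page_with(domid_t peer, unsigned long *page_out,
                               grant_ref_t *ref_out)
    {
            unsigned long page = __get_free_page(GFP_KERNEL);
            int ref;

            if (page == 0)
                    return -ENOMEM;

            /* Grant 'peer' read/write access to the frame backing 'page'. */
            ref = gnttab_grant_foreign_access(peer, virt_to_mfn(page), 0);
            if (ref < 0) {
                    free_page(page);
                    return ref;
            }

            *page_out = page;
            *ref_out = ref;
            return 0;
    }

    static void unshare_page(grant_ref_t ref, unsigned long page)
    {
            /* Revoke the grant; per the comment on
             * gnttab_end_foreign_access() above, the page is freed once
             * the grant entry is no longer in use. */
            gnttab_end_foreign_access(ref, 0, page);
    }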
77567 diff -Nurp ref-linux-2.6.16.9/include/xen/hypervisor_sysfs.h tmp-linux-2.6-xen.patch/include/xen/hypervisor_sysfs.h
77568 --- ref-linux-2.6.16.9/include/xen/hypervisor_sysfs.h   1970-01-01 01:00:00.000000000 +0100
77569 +++ tmp-linux-2.6-xen.patch/include/xen/hypervisor_sysfs.h      2006-04-10 00:05:52.000000000 +0200
77570 @@ -0,0 +1,32 @@
77571 +/*
77572 + *  copyright (c) 2006 IBM Corporation
77573 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
77574 + *
77575 + *  This program is free software; you can redistribute it and/or modify
77576 + *  it under the terms of the GNU General Public License version 2 as
77577 + *  published by the Free Software Foundation.
77578 + */
77579 +
77580 +#ifndef _HYP_SYSFS_H_
77581 +#define _HYP_SYSFS_H_
77582 +
77583 +#include <linux/kobject.h>
77584 +#include <linux/sysfs.h>
77585 +
77586 +#define HYPERVISOR_ATTR_RO(_name) \
77587 +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
77588 +
77589 +#define HYPERVISOR_ATTR_RW(_name) \
77590 +static struct hyp_sysfs_attr _name##_attr = \
77591 +       __ATTR(_name, 0644, _name##_show, _name##_store)
77592 +
77593 +extern struct subsystem hypervisor_subsys;
77594 +
77595 +struct hyp_sysfs_attr {
77596 +       struct attribute attr;
77597 +       ssize_t (*show)(struct hyp_sysfs_attr *, char *);
77598 +       ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
77599 +       void *hyp_attr_data;
77600 +};
77601 +
77602 +#endif /* _HYP_SYSFS_H_ */
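A sketch of how the macros above are meant to be used: define a _show routine, wrap it with HYPERVISOR_ATTR_RO(), and register the resulting attribute against hypervisor_subsys. The attribute name and its contents here are illustrative only, as is the registration helper.

    /* Hypothetical read-only attribute built with HYPERVISOR_ATTR_RO(). */
    static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
    {
            /* A real implementation would query the hypervisor here. */
            return sprintf(buffer, "00000000-0000-0000-0000-000000000000\n");
    }

    HYPERVISOR_ATTR_RO(uuid);

    static int __init uuid_attr_init(void)
    {
            /* hypervisor_subsys is declared extern above; this embeds
             * the attribute under /sys/hypervisor. */
            return sysfs_create_file(&hypervisor_subsys.kset.kobj,
                                     &uuid_attr.attr);
    }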
77603 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/acm.h tmp-linux-2.6-xen.patch/include/xen/interface/acm.h
77604 --- ref-linux-2.6.16.9/include/xen/interface/acm.h      1970-01-01 01:00:00.000000000 +0100
77605 +++ tmp-linux-2.6-xen.patch/include/xen/interface/acm.h 2006-04-10 00:05:57.000000000 +0200
77606 @@ -0,0 +1,181 @@
77607 +/*
77608 + * acm.h: Xen access control module interface definitions
77609 + *
77610 + * Reiner Sailer <sailer@watson.ibm.com>
77611 + * Copyright (c) 2005, International Business Machines Corporation.
77612 + */
77613 +
77614 +#ifndef _XEN_PUBLIC_ACM_H
77615 +#define _XEN_PUBLIC_ACM_H
77616 +
77617 +#include "xen.h"
77618 +#include "sched_ctl.h"
77619 +
77620 +/* If ACM_DEBUG is defined, all hooks should
77621 + * print a short trace message (comment it out
77622 + * when not in testing mode).
77623 + */
77624 +/* #define ACM_DEBUG */
77625 +
77626 +#ifdef ACM_DEBUG
77627 +#  define printkd(fmt, args...) printk(fmt,## args)
77628 +#else
77629 +#  define printkd(fmt, args...)
77630 +#endif
77631 +
77632 +/* default ssid reference value if not supplied */
77633 +#define ACM_DEFAULT_SSID  0x0
77634 +#define ACM_DEFAULT_LOCAL_SSID  0x0
77635 +
77636 +/* Internal ACM ERROR types */
77637 +#define ACM_OK     0
77638 +#define ACM_UNDEF   -1
77639 +#define ACM_INIT_SSID_ERROR  -2
77640 +#define ACM_INIT_SOID_ERROR  -3
77641 +#define ACM_ERROR          -4
77642 +
77643 +/* External ACCESS DECISIONS */
77644 +#define ACM_ACCESS_PERMITTED        0
77645 +#define ACM_ACCESS_DENIED           -111
77646 +#define ACM_NULL_POINTER_ERROR      -200
77647 +
77648 +/* primary policy in lower 4 bits */
77649 +#define ACM_NULL_POLICY 0
77650 +#define ACM_CHINESE_WALL_POLICY 1
77651 +#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
77652 +#define ACM_POLICY_UNDEFINED 15
77653 +
77654 +/* combinations have a secondary policy component in the upper 4 bits */
77655 +#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
77656 +    ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
77657 +
77658 +/* policy: */
77659 +#define ACM_POLICY_NAME(X) \
77660 + ((X) == (ACM_NULL_POLICY)) ? "NULL policy" :                        \
77661 +    ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL policy" :        \
77662 +    ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT policy" : \
77663 +    ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT policy" : \
77664 +     "UNDEFINED policy"
77665 +
77666 +/* the following policy versions must be increased
77667 + * whenever the interpretation of the related
77668 + * policy's data structure changes
77669 + */
77670 +#define ACM_POLICY_VERSION 1
77671 +#define ACM_CHWALL_VERSION 1
77672 +#define ACM_STE_VERSION  1
77673 +
77674 +/* defines an ssid reference used by Xen */
77675 +typedef uint32_t ssidref_t;
77676 +
77677 +/* hooks that are known to domains */
77678 +enum acm_hook_type {NONE=0, SHARING};
77679 +
77680 +/* -------security policy relevant type definitions-------- */
77681 +
77682 +/* type identifier; compares to "equal" or "not equal" */
77683 +typedef uint16_t domaintype_t;
77684 +
77685 +/* CHINESE WALL POLICY DATA STRUCTURES
77686 + *
77687 + * current accumulated conflict type set:
77688 + * When a domain is started and has a type that is in
77689 + * a conflict set, the counts for the conflicting types are
77690 + * incremented in the aggregate set. When a domain is destroyed,
77691 + * the counts for the types conflicting with its types are decremented.
77692 + * If a domain has multiple types, this procedure works over
77693 + * all those types.
77694 + *
77695 + * conflict_aggregate_set[i] holds the number of
77696 + *   running domains that have a conflict with type i.
77697 + *
77698 + * running_types[i] holds the number of running domains
77699 + *        that include type i in their ssidref-referenced type set
77700 + *
77701 + * conflict_sets[i][j] is "0" if type j has no conflict
77702 + *    with type i and is "1" otherwise.
77703 + */
77704 +/* high-16 = version, low-16 = check magic */
77705 +#define ACM_MAGIC  0x0001debc
77706 +
77707 +/* each offset is in bytes from the start of the struct
77708 + * it is part of */
77709 +
77710 +/* each buffer consists of all policy information for
77711 + * the respective policy given in the policy code
77712 + *
77713 + * acm_policy_buffer, acm_chwall_policy_buffer,
77714 + * and acm_ste_policy_buffer need to stay 32-bit aligned
77715 + * because binary policies are also created with external
77716 + * tools that assume packed representations (e.g. the Java tool)
77717 + */
77718 +struct acm_policy_buffer {
77719 +    uint32_t policy_version; /* ACM_POLICY_VERSION */
77720 +    uint32_t magic;
77721 +    uint32_t len;
77722 +    uint32_t primary_policy_code;
77723 +    uint32_t primary_buffer_offset;
77724 +    uint32_t secondary_policy_code;
77725 +    uint32_t secondary_buffer_offset;
77726 +};
77727 +
77728 +struct acm_chwall_policy_buffer {
77729 +    uint32_t policy_version; /* ACM_CHWALL_VERSION */
77730 +    uint32_t policy_code;
77731 +    uint32_t chwall_max_types;
77732 +    uint32_t chwall_max_ssidrefs;
77733 +    uint32_t chwall_max_conflictsets;
77734 +    uint32_t chwall_ssid_offset;
77735 +    uint32_t chwall_conflict_sets_offset;
77736 +    uint32_t chwall_running_types_offset;
77737 +    uint32_t chwall_conflict_aggregate_offset;
77738 +};
77739 +
77740 +struct acm_ste_policy_buffer {
77741 +    uint32_t policy_version; /* ACM_STE_VERSION */
77742 +    uint32_t policy_code;
77743 +    uint32_t ste_max_types;
77744 +    uint32_t ste_max_ssidrefs;
77745 +    uint32_t ste_ssid_offset;
77746 +};
77747 +
77748 +struct acm_stats_buffer {
77749 +    uint32_t magic;
77750 +    uint32_t len;
77751 +    uint32_t primary_policy_code;
77752 +    uint32_t primary_stats_offset;
77753 +    uint32_t secondary_policy_code;
77754 +    uint32_t secondary_stats_offset;
77755 +};
77756 +
77757 +struct acm_ste_stats_buffer {
77758 +    uint32_t ec_eval_count;
77759 +    uint32_t gt_eval_count;
77760 +    uint32_t ec_denied_count;
77761 +    uint32_t gt_denied_count;
77762 +    uint32_t ec_cachehit_count;
77763 +    uint32_t gt_cachehit_count;
77764 +};
77765 +
77766 +struct acm_ssid_buffer {
77767 +    uint32_t len;
77768 +    ssidref_t ssidref;
77769 +    uint32_t primary_policy_code;
77770 +    uint32_t primary_max_types;
77771 +    uint32_t primary_types_offset;
77772 +    uint32_t secondary_policy_code;
77773 +    uint32_t secondary_max_types;
77774 +    uint32_t secondary_types_offset;
77775 +};
77776 +
77777 +#endif
77778 +
77779 +/*
77780 + * Local variables:
77781 + * mode: C
77782 + * c-set-style: "BSD"
77783 + * c-basic-offset: 4
77784 + * tab-width: 4
77785 + * indent-tabs-mode: nil
77786 + * End:
77787 + */
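The primary/secondary packing above is easiest to see pulled apart; a short sketch with hypothetical helper names:

    /* Hypothetical helpers decoding the policy code layout above:
     * primary policy in the lower 4 bits, secondary in the upper 4. */
    static inline uint32_t acm_primary_policy(uint32_t code)
    {
            return code & 0xf;
    }

    static inline uint32_t acm_secondary_policy(uint32_t code)
    {
            return (code >> 4) & 0xf;
    }

    /* For ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY (0x21):
     *   acm_primary_policy()   == ACM_CHINESE_WALL_POLICY            (1)
     *   acm_secondary_policy() == ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY (2)
     * and ACM_POLICY_NAME() maps the combined code back to a string. */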
77788 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/acm_ops.h tmp-linux-2.6-xen.patch/include/xen/interface/acm_ops.h
77789 --- ref-linux-2.6.16.9/include/xen/interface/acm_ops.h  1970-01-01 01:00:00.000000000 +0100
77790 +++ tmp-linux-2.6-xen.patch/include/xen/interface/acm_ops.h     2006-04-10 00:05:57.000000000 +0200
77791 @@ -0,0 +1,98 @@
77792 +/*
77793 + * acm_ops.h: Xen access control module hypervisor commands
77794 + *
77795 + * Reiner Sailer <sailer@watson.ibm.com>
77796 + * Copyright (c) 2005, International Business Machines Corporation.
77797 + */
77798 +
77799 +#ifndef __XEN_PUBLIC_ACM_OPS_H__
77800 +#define __XEN_PUBLIC_ACM_OPS_H__
77801 +
77802 +#include "xen.h"
77803 +#include "sched_ctl.h"
77804 +#include "acm.h"
77805 +
77806 +/*
77807 + * Make sure you increment the interface version whenever you modify this file!
77808 + * This makes sure that old versions of acm tools will stop working in a
77809 + * well-defined way (rather than crashing the machine, for instance).
77810 + */
77811 +#define ACM_INTERFACE_VERSION   0xAAAA0005
77812 +
77813 +/************************************************************************/
77814 +
77815 +#define ACM_SETPOLICY         4
77816 +struct acm_setpolicy {
77817 +    /* IN variables */
77818 +    void *pushcache;
77819 +    uint32_t pushcache_size;
77820 +};
77821 +
77822 +
77823 +#define ACM_GETPOLICY         5
77824 +struct acm_getpolicy {
77825 +    /* OUT variables */
77826 +    void *pullcache;
77827 +    uint32_t pullcache_size;
77828 +};
77829 +
77830 +
77831 +#define ACM_DUMPSTATS         6
77832 +struct acm_dumpstats {
77833 +    void *pullcache;
77834 +    uint32_t pullcache_size;
77835 +};
77836 +
77837 +
77838 +#define ACM_GETSSID           7
77839 +enum get_type {UNSET=0, SSIDREF, DOMAINID};
77840 +struct acm_getssid {
77841 +    enum get_type get_ssid_by;
77842 +    union {
77843 +        domaintype_t domainid;
77844 +        ssidref_t    ssidref;
77845 +    } id;
77846 +    void *ssidbuf;
77847 +    uint32_t ssidbuf_size;
77848 +};
77849 +
77850 +#define ACM_GETDECISION        8
77851 +struct acm_getdecision {
77852 +    enum get_type get_decision_by1; /* in */
77853 +    enum get_type get_decision_by2;
77854 +    union {
77855 +        domaintype_t domainid;
77856 +        ssidref_t    ssidref;
77857 +    } id1;
77858 +    union {
77859 +        domaintype_t domainid;
77860 +        ssidref_t    ssidref;
77861 +    } id2;
77862 +    enum acm_hook_type hook;
77863 +    int acm_decision;           /* out */
77864 +};
77865 +
77866 +typedef struct acm_op {
77867 +    uint32_t cmd;
77868 +    uint32_t interface_version;      /* ACM_INTERFACE_VERSION */
77869 +    union {
77870 +        struct acm_setpolicy setpolicy;
77871 +        struct acm_getpolicy getpolicy;
77872 +        struct acm_dumpstats dumpstats;
77873 +        struct acm_getssid getssid;
77874 +        struct acm_getdecision getdecision;
77875 +    } u;
77876 +} acm_op_t;
77877 +DEFINE_GUEST_HANDLE(acm_op_t);
77878 +
77879 +#endif                          /* __XEN_PUBLIC_ACM_OPS_H__ */
77880 +
77881 +/*
77882 + * Local variables:
77883 + * mode: C
77884 + * c-set-style: "BSD"
77885 + * c-basic-offset: 4
77886 + * tab-width: 4
77887 + * indent-tabs-mode: nil
77888 + * End:
77889 + */
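As a sketch of the calling convention: a dom0 tool fills in cmd and interface_version, then the relevant union member. The hypercall wrapper name HYPERVISOR_acm_op() and its signature are assumptions of this sketch.

    /* Hypothetical sketch: fetch the binary policy into 'buf'. */
    static int acm_fetch_policy(void *buf, uint32_t size)
    {
            struct acm_op op = {
                    .cmd               = ACM_GETPOLICY,
                    .interface_version = ACM_INTERFACE_VERSION,
            };

            op.u.getpolicy.pullcache      = buf;
            op.u.getpolicy.pullcache_size = size;

            /* Assumed hypercall wrapper; as described above, the
             * hypervisor rejects the op if interface_version does
             * not match. */
            return HYPERVISOR_acm_op(&op);
    }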
77890 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/arch-ia64.h tmp-linux-2.6-xen.patch/include/xen/interface/arch-ia64.h
77891 --- ref-linux-2.6.16.9/include/xen/interface/arch-ia64.h        1970-01-01 01:00:00.000000000 +0100
77892 +++ tmp-linux-2.6-xen.patch/include/xen/interface/arch-ia64.h   2006-04-10 00:05:57.000000000 +0200
77893 @@ -0,0 +1,337 @@
77894 +/******************************************************************************
77895 + * arch-ia64.h
77896 + * 
77897 + * Guest OS interface to IA64 Xen.
77898 + */
77899 +
77900 +#ifndef __HYPERVISOR_IF_IA64_H__
77901 +#define __HYPERVISOR_IF_IA64_H__
77902 +
77903 +#ifdef __XEN__
77904 +#define __DEFINE_GUEST_HANDLE(name, type) \
77905 +    typedef struct { type *p; } __guest_handle_ ## name
77906 +#else
77907 +#define __DEFINE_GUEST_HANDLE(name, type) \
77908 +    typedef type * __guest_handle_ ## name
77909 +#endif
77910 +
77911 +#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
77912 +#define GUEST_HANDLE(name)        __guest_handle_ ## name
77913 +
77914 +#ifndef __ASSEMBLY__
77915 +/* Guest handles for primitive C types. */
77916 +__DEFINE_GUEST_HANDLE(uchar, unsigned char);
77917 +__DEFINE_GUEST_HANDLE(uint,  unsigned int);
77918 +__DEFINE_GUEST_HANDLE(ulong, unsigned long);
77919 +DEFINE_GUEST_HANDLE(char);
77920 +DEFINE_GUEST_HANDLE(int);
77921 +DEFINE_GUEST_HANDLE(long);
77922 +DEFINE_GUEST_HANDLE(void);
77923 +#endif
77924 +
77925 +/* Maximum number of virtual CPUs in multi-processor guests. */
77926 +/* WARNING: before changing this, check that shared_info fits on a page */
77927 +#define MAX_VIRT_CPUS 4
77928 +
77929 +#ifndef __ASSEMBLY__
77930 +
77931 +#define MAX_NR_SECTION  32  /* at most 32 memory holes */
77932 +typedef struct {
77933 +    unsigned long start;  /* start of memory hole */
77934 +    unsigned long end;    /* end of memory hole */
77935 +} mm_section_t;
77936 +
77937 +typedef struct {
77938 +    unsigned long mfn : 56;
77939 +    unsigned long type: 8;
77940 +} pmt_entry_t;
77941 +
77942 +#define GPFN_MEM          (0UL << 56) /* Guest pfn is normal mem */
77943 +#define GPFN_FRAME_BUFFER (1UL << 56) /* VGA framebuffer */
77944 +#define GPFN_LOW_MMIO     (2UL << 56) /* Low MMIO range */
77945 +#define GPFN_PIB          (3UL << 56) /* PIB base */
77946 +#define GPFN_IOSAPIC      (4UL << 56) /* IOSAPIC base */
77947 +#define GPFN_LEGACY_IO    (5UL << 56) /* Legacy I/O base */
77948 +#define GPFN_GFW          (6UL << 56) /* Guest Firmware */
77949 +#define GPFN_HIGH_MMIO    (7UL << 56) /* High MMIO range */
77950 +
77951 +#define GPFN_IO_MASK     (7UL << 56)  /* Guest pfn is I/O type */
77952 +#define GPFN_INV_MASK    (31UL << 59) /* Guest pfn is invalid */
77953 +
77954 +#define INVALID_MFN       (~0UL)
77955 +
77956 +#define MEM_G   (1UL << 30)
77957 +#define MEM_M   (1UL << 20)
77958 +
77959 +#define MMIO_START       (3 * MEM_G)
77960 +#define MMIO_SIZE        (512 * MEM_M)
77961 +
77962 +#define VGA_IO_START     0xA0000UL
77963 +#define VGA_IO_SIZE      0x20000
77964 +
77965 +#define LEGACY_IO_START  (MMIO_START + MMIO_SIZE)
77966 +#define LEGACY_IO_SIZE   (64*MEM_M)
77967 +
77968 +#define IO_PAGE_START (LEGACY_IO_START + LEGACY_IO_SIZE)
77969 +#define IO_PAGE_SIZE  PAGE_SIZE
77970 +
77971 +#define STORE_PAGE_START (IO_PAGE_START + IO_PAGE_SIZE)
77972 +#define STORE_PAGE_SIZE         PAGE_SIZE
77973 +
77974 +#define IO_SAPIC_START   0xfec00000UL
77975 +#define IO_SAPIC_SIZE    0x100000
77976 +
77977 +#define PIB_START 0xfee00000UL
77978 +#define PIB_SIZE 0x100000
77979 +
77980 +#define GFW_START        (4*MEM_G -16*MEM_M)
77981 +#define GFW_SIZE         (16*MEM_M)
77982 +
77983 +/*
77984 + * NB. This may become a 64-bit count with no shift. If this happens then the 
77985 + * structure size will still be 8 bytes, so no other alignments will change.
77986 + */
77987 +typedef struct {
77988 +    unsigned int  tsc_bits;      /* 0: 32 bits read from the CPU's TSC. */
77989 +    unsigned int  tsc_bitshift;  /* 4: 'tsc_bits' uses N:N+31 of TSC.   */
77990 +} tsc_timestamp_t; /* 8 bytes */
77991 +
77992 +struct pt_fpreg {
77993 +    union {
77994 +        unsigned long bits[2];
77995 +        long double __dummy;    /* force 16-byte alignment */
77996 +    } u;
77997 +};
77998 +
77999 +typedef struct cpu_user_regs {
78000 +    /* The following registers are saved by SAVE_MIN: */
78001 +    unsigned long b6;  /* scratch */
78002 +    unsigned long b7;  /* scratch */
78003 +
78004 +    unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
78005 +    unsigned long ar_ssd; /* reserved for future use (scratch) */
78006 +
78007 +    unsigned long r8;  /* scratch (return value register 0) */
78008 +    unsigned long r9;  /* scratch (return value register 1) */
78009 +    unsigned long r10; /* scratch (return value register 2) */
78010 +    unsigned long r11; /* scratch (return value register 3) */
78011 +
78012 +    unsigned long cr_ipsr; /* interrupted task's psr */
78013 +    unsigned long cr_iip;  /* interrupted task's instruction pointer */
78014 +    unsigned long cr_ifs;  /* interrupted task's function state */
78015 +
78016 +    unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
78017 +    unsigned long ar_pfs;  /* prev function state  */
78018 +    unsigned long ar_rsc;  /* RSE configuration */
78019 +    /* The following two are valid only if cr_ipsr.cpl > 0: */
78020 +    unsigned long ar_rnat;  /* RSE NaT */
78021 +    unsigned long ar_bspstore; /* RSE bspstore */
78022 +
78023 +    unsigned long pr;  /* 64 predicate registers (1 bit each) */
78024 +    unsigned long b0;  /* return pointer (bp) */
78025 +    unsigned long loadrs;  /* size of dirty partition << 16 */
78026 +
78027 +    unsigned long r1;  /* the gp pointer */
78028 +    unsigned long r12; /* interrupted task's memory stack pointer */
78029 +    unsigned long r13; /* thread pointer */
78030 +
78031 +    unsigned long ar_fpsr;  /* floating point status (preserved) */
78032 +    unsigned long r15;  /* scratch */
78033 +
78034 +    /* The remaining registers are NOT saved for system calls. */
78035 +
78036 +    unsigned long r14;  /* scratch */
78037 +    unsigned long r2;  /* scratch */
78038 +    unsigned long r3;  /* scratch */
78039 +    unsigned long r16;  /* scratch */
78040 +    unsigned long r17;  /* scratch */
78041 +    unsigned long r18;  /* scratch */
78042 +    unsigned long r19;  /* scratch */
78043 +    unsigned long r20;  /* scratch */
78044 +    unsigned long r21;  /* scratch */
78045 +    unsigned long r22;  /* scratch */
78046 +    unsigned long r23;  /* scratch */
78047 +    unsigned long r24;  /* scratch */
78048 +    unsigned long r25;  /* scratch */
78049 +    unsigned long r26;  /* scratch */
78050 +    unsigned long r27;  /* scratch */
78051 +    unsigned long r28;  /* scratch */
78052 +    unsigned long r29;  /* scratch */
78053 +    unsigned long r30;  /* scratch */
78054 +    unsigned long r31;  /* scratch */
78055 +    unsigned long ar_ccv;  /* compare/exchange value (scratch) */
78056 +
78057 +    /*
78058 +     * Floating point registers that the kernel considers scratch:
78059 +     */
78060 +    struct pt_fpreg f6;  /* scratch */
78061 +    struct pt_fpreg f7;  /* scratch */
78062 +    struct pt_fpreg f8;  /* scratch */
78063 +    struct pt_fpreg f9;  /* scratch */
78064 +    struct pt_fpreg f10;  /* scratch */
78065 +    struct pt_fpreg f11;  /* scratch */
78066 +    unsigned long r4;  /* preserved */
78067 +    unsigned long r5;  /* preserved */
78068 +    unsigned long r6;  /* preserved */
78069 +    unsigned long r7;  /* preserved */
78070 +    unsigned long eml_unat;    /* used for emulating instruction */
78071 +    unsigned long rfi_pfs;     /* used for emulating rfi */
78072 +
78073 +} cpu_user_regs_t;
78074 +
78075 +typedef union {
78076 +    unsigned long value;
78077 +    struct {
78078 +        int a_int:1;
78079 +        int a_from_int_cr:1;
78080 +        int a_to_int_cr:1;
78081 +        int a_from_psr:1;
78082 +        int a_from_cpuid:1;
78083 +        int a_cover:1;
78084 +        int a_bsw:1;
78085 +        long reserved:57;
78086 +    };
78087 +} vac_t;
78088 +
78089 +typedef union {
78090 +    unsigned long value;
78091 +    struct {
78092 +        int d_vmsw:1;
78093 +        int d_extint:1;
78094 +        int d_ibr_dbr:1;
78095 +        int d_pmc:1;
78096 +        int d_to_pmd:1;
78097 +        int d_itm:1;
78098 +        long reserved:58;
78099 +    };
78100 +} vdc_t;
78101 +
78102 +typedef struct {
78103 +    vac_t   vac;
78104 +    vdc_t   vdc;
78105 +    unsigned long  virt_env_vaddr;
78106 +    unsigned long  reserved1[29];
78107 +    unsigned long  vhpi;
78108 +    unsigned long  reserved2[95];
78109 +    union {
78110 +        unsigned long  vgr[16];
78111 +        unsigned long bank1_regs[16]; // bank1 regs (r16-r31) when bank0 active
78112 +    };
78113 +    union {
78114 +        unsigned long  vbgr[16];
78115 +        unsigned long bank0_regs[16]; // bank0 regs (r16-r31) when bank1 active
78116 +    };
78117 +    unsigned long  vnat;
78118 +    unsigned long  vbnat;
78119 +    unsigned long  vcpuid[5];
78120 +    unsigned long  reserved3[11];
78121 +    unsigned long  vpsr;
78122 +    unsigned long  vpr;
78123 +    unsigned long  reserved4[76];
78124 +    union {
78125 +        unsigned long  vcr[128];
78126 +        struct {
78127 +            unsigned long dcr;  // CR0
78128 +            unsigned long itm;
78129 +            unsigned long iva;
78130 +            unsigned long rsv1[5];
78131 +            unsigned long pta;  // CR8
78132 +            unsigned long rsv2[7];
78133 +            unsigned long ipsr;  // CR16
78134 +            unsigned long isr;
78135 +            unsigned long rsv3;
78136 +            unsigned long iip;
78137 +            unsigned long ifa;
78138 +            unsigned long itir;
78139 +            unsigned long iipa;
78140 +            unsigned long ifs;
78141 +            unsigned long iim;  // CR24
78142 +            unsigned long iha;
78143 +            unsigned long rsv4[38];
78144 +            unsigned long lid;  // CR64
78145 +            unsigned long ivr;
78146 +            unsigned long tpr;
78147 +            unsigned long eoi;
78148 +            unsigned long irr[4];
78149 +            unsigned long itv;  // CR72
78150 +            unsigned long pmv;
78151 +            unsigned long cmcv;
78152 +            unsigned long rsv5[5];
78153 +            unsigned long lrr0;  // CR80
78154 +            unsigned long lrr1;
78155 +            unsigned long rsv6[46];
78156 +        };
78157 +    };
78158 +    union {
78159 +        unsigned long  reserved5[128];
78160 +        struct {
78161 +            unsigned long precover_ifs;
78162 +            unsigned long unat;  // not sure if this is needed until NaT arch is done
78163 +            int interrupt_collection_enabled; // virtual psr.ic
78164 +            int interrupt_delivery_enabled; // virtual psr.i
78165 +            int pending_interruption;
78166 +            int incomplete_regframe; // see SDM vol2 6.8
78167 +            unsigned long reserved5_1[4];
78168 +            int metaphysical_mode; // 1 = use metaphys mapping, 0 = use virtual
78169 +            int banknum; // 0 or 1, which virtual register bank is active
78170 +            unsigned long rrs[8]; // region registers
78171 +            unsigned long krs[8]; // kernel registers
78172 +            unsigned long pkrs[8]; // protection key registers
78173 +            unsigned long tmp[8]; // temp registers (e.g. for hyperprivops)
78174 +            // FIXME: tmp[8] is temporarily being used for virtual psr.pp
78175 +        };
78176 +    };
78177 +    unsigned long  reserved6[3456];
78178 +    unsigned long  vmm_avail[128];
78179 +    unsigned long  reserved7[4096];
78180 +} mapped_regs_t;
78181 +
78182 +typedef struct {
78183 +    mapped_regs_t *privregs;
78184 +    int evtchn_vector;
78185 +} arch_vcpu_info_t;
78186 +
78187 +typedef mapped_regs_t vpd_t;
78188 +
78189 +typedef struct {
78190 +    unsigned int flags;
78191 +    unsigned long start_info_pfn;
78192 +} arch_shared_info_t;
78193 +
78194 +typedef struct {
78195 +    unsigned long start;
78196 +    unsigned long size;
78197 +} arch_initrd_info_t;
78198 +
78199 +#define IA64_COMMAND_LINE_SIZE 512
78200 +typedef struct vcpu_guest_context {
78201 +#define VGCF_FPU_VALID (1<<0)
78202 +#define VGCF_VMX_GUEST (1<<1)
78203 +#define VGCF_IN_KERNEL (1<<2)
78204 +    unsigned long flags;       /* VGCF_* flags */
78205 +    unsigned long pt_base;     /* PMT table base */
78206 +    unsigned long share_io_pg; /* Shared page for I/O emulation */
78207 +    unsigned long sys_pgnr;    /* System pages out of domain memory */
78208 +    unsigned long vm_assist;   /* VMASST_TYPE_* bitmap, now none on IPF */
78209 +
78210 +    cpu_user_regs_t regs;
78211 +    arch_vcpu_info_t vcpu;
78212 +    arch_shared_info_t shared;
78213 +    arch_initrd_info_t initrd;
78214 +    char cmdline[IA64_COMMAND_LINE_SIZE];
78215 +} vcpu_guest_context_t;
78216 +DEFINE_GUEST_HANDLE(vcpu_guest_context_t);
78217 +
78218 +#endif /* !__ASSEMBLY__ */
78219 +
78220 +#endif /* __HYPERVISOR_IF_IA64_H__ */
78221 +
78222 +/*
78223 + * Local variables:
78224 + * mode: C
78225 + * c-set-style: "BSD"
78226 + * c-basic-offset: 4
78227 + * tab-width: 4
78228 + * indent-tabs-mode: nil
78229 + * End:
78230 + */
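The guest-handle macros at the top of this header (and repeated in the x86 variants below) are easiest to read via their expansion; the following illustration is informational only.

    /* Illustrative expansion of the guest-handle macros above.
     *
     * Outside Xen (__XEN__ undefined) a handle is a bare pointer:
     *     DEFINE_GUEST_HANDLE(int);   =>  typedef int *__guest_handle_int;
     *     GUEST_HANDLE(int) h;        =>  int *h;
     *
     * Inside Xen it becomes a one-member struct, so guest pointers
     * cannot be dereferenced by accident:
     *     typedef struct { int *p; } __guest_handle_int;
     */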
78231 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/arch-x86_32.h tmp-linux-2.6-xen.patch/include/xen/interface/arch-x86_32.h
78232 --- ref-linux-2.6.16.9/include/xen/interface/arch-x86_32.h      1970-01-01 01:00:00.000000000 +0100
78233 +++ tmp-linux-2.6-xen.patch/include/xen/interface/arch-x86_32.h 2006-04-10 00:05:57.000000000 +0200
78234 @@ -0,0 +1,195 @@
78235 +/******************************************************************************
78236 + * arch-x86_32.h
78237 + * 
78238 + * Guest OS interface to x86 32-bit Xen.
78239 + * 
78240 + * Copyright (c) 2004, K A Fraser
78241 + */
78242 +
78243 +#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
78244 +#define __XEN_PUBLIC_ARCH_X86_32_H__
78245 +
78246 +#ifdef __XEN__
78247 +#define __DEFINE_GUEST_HANDLE(name, type) \
78248 +    typedef struct { type *p; } __guest_handle_ ## name
78249 +#else
78250 +#define __DEFINE_GUEST_HANDLE(name, type) \
78251 +    typedef type * __guest_handle_ ## name
78252 +#endif
78253 +
78254 +#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
78255 +#define GUEST_HANDLE(name)        __guest_handle_ ## name
78256 +
78257 +#ifndef __ASSEMBLY__
78258 +/* Guest handles for primitive C types. */
78259 +__DEFINE_GUEST_HANDLE(uchar, unsigned char);
78260 +__DEFINE_GUEST_HANDLE(uint,  unsigned int);
78261 +__DEFINE_GUEST_HANDLE(ulong, unsigned long);
78262 +DEFINE_GUEST_HANDLE(char);
78263 +DEFINE_GUEST_HANDLE(int);
78264 +DEFINE_GUEST_HANDLE(long);
78265 +DEFINE_GUEST_HANDLE(void);
78266 +#endif
78267 +
78268 +/*
78269 + * SEGMENT DESCRIPTOR TABLES
78270 + */
78271 +/*
78272 + * A number of GDT entries are reserved by Xen. These are not situated at the
78273 + * start of the GDT because some stupid OSes export hard-coded selector values
78274 + * in their ABI. These hard-coded values are always near the start of the GDT,
78275 + * so Xen places itself out of the way, at the far end of the GDT.
78276 + */
78277 +#define FIRST_RESERVED_GDT_PAGE  14
78278 +#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
78279 +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
78280 +
78281 +/*
78282 + * These flat segments are in the Xen-private section of every GDT. Since these
78283 + * are also present in the initial GDT, many OSes will be able to avoid
78284 + * installing their own GDT.
78285 + */
78286 +#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
78287 +#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
78288 +#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
78289 +#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
78290 +#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
78291 +#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
78292 +
78293 +#define FLAT_KERNEL_CS FLAT_RING1_CS
78294 +#define FLAT_KERNEL_DS FLAT_RING1_DS
78295 +#define FLAT_KERNEL_SS FLAT_RING1_SS
78296 +#define FLAT_USER_CS    FLAT_RING3_CS
78297 +#define FLAT_USER_DS    FLAT_RING3_DS
78298 +#define FLAT_USER_SS    FLAT_RING3_SS
78299 +
78300 +/* And the trap vector is... */
78301 +#define TRAP_INSTR "int $0x82"
78302 +
78303 +/*
78304 + * Virtual addresses beyond this are not modifiable by guest OSes. The 
78305 + * machine->physical mapping table starts at this address, read-only.
78306 + */
78307 +#ifdef CONFIG_X86_PAE
78308 +#define __HYPERVISOR_VIRT_START 0xF5800000
78309 +#else
78310 +#define __HYPERVISOR_VIRT_START 0xFC000000
78311 +#endif
78312 +
78313 +#ifndef HYPERVISOR_VIRT_START
78314 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
78315 +#endif
78316 +
78317 +#ifndef machine_to_phys_mapping
78318 +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
78319 +#endif
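machine_to_phys_mapping is simply a read-only array indexed by machine frame number; a one-line sketch of the translation it provides (the wrapper name is hypothetical):

    /* Hypothetical wrapper around the read-only M2P table above:
     * translate a machine frame back to a guest pseudo-physical frame. */
    static inline unsigned long m2p_lookup(unsigned long mfn)
    {
            return machine_to_phys_mapping[mfn];
    }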
78320 +
78321 +/* Maximum number of virtual CPUs in multi-processor guests. */
78322 +#define MAX_VIRT_CPUS 32
78323 +
78324 +#ifndef __ASSEMBLY__
78325 +
78326 +/*
78327 + * Send an array of these to HYPERVISOR_set_trap_table()
78328 + */
78329 +#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
78330 +#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
78331 +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
78332 +#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
78333 +typedef struct trap_info {
78334 +    uint8_t       vector;  /* exception vector                              */
78335 +    uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
78336 +    uint16_t      cs;      /* code selector                                 */
78337 +    unsigned long address; /* code offset                                   */
78338 +} trap_info_t;
78339 +DEFINE_GUEST_HANDLE(trap_info_t);
78340 +
78341 +typedef struct cpu_user_regs {
78342 +    uint32_t ebx;
78343 +    uint32_t ecx;
78344 +    uint32_t edx;
78345 +    uint32_t esi;
78346 +    uint32_t edi;
78347 +    uint32_t ebp;
78348 +    uint32_t eax;
78349 +    uint16_t error_code;    /* private */
78350 +    uint16_t entry_vector;  /* private */
78351 +    uint32_t eip;
78352 +    uint16_t cs;
78353 +    uint8_t  saved_upcall_mask;
78354 +    uint8_t  _pad0;
78355 +    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
78356 +    uint32_t esp;
78357 +    uint16_t ss, _pad1;
78358 +    uint16_t es, _pad2;
78359 +    uint16_t ds, _pad3;
78360 +    uint16_t fs, _pad4;
78361 +    uint16_t gs, _pad5;
78362 +} cpu_user_regs_t;
78363 +DEFINE_GUEST_HANDLE(cpu_user_regs_t);
78364 +
78365 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
78366 +
78367 +/*
78368 + * The following is all CPU context. Note that the fpu_ctxt block is filled 
78369 + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
78370 + */
78371 +typedef struct vcpu_guest_context {
78372 +    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
78373 +    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
78374 +#define VGCF_I387_VALID (1<<0)
78375 +#define VGCF_HVM_GUEST  (1<<1)
78376 +#define VGCF_IN_KERNEL  (1<<2)
78377 +    unsigned long flags;                    /* VGCF_* flags                 */
78378 +    cpu_user_regs_t user_regs;              /* User-level CPU registers     */
78379 +    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
78380 +    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
78381 +    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
78382 +    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
78383 +    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
78384 +    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
78385 +    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
78386 +    unsigned long event_callback_eip;
78387 +    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
78388 +    unsigned long failsafe_callback_eip;
78389 +    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
78390 +} vcpu_guest_context_t;
78391 +DEFINE_GUEST_HANDLE(vcpu_guest_context_t);
78392 +
78393 +typedef struct arch_shared_info {
78394 +    unsigned long max_pfn;                  /* max pfn that appears in table */
78395 +    /* Frame containing list of mfns containing list of mfns containing p2m. */
78396 +    unsigned long pfn_to_mfn_frame_list_list;
78397 +    unsigned long nmi_reason;
78398 +} arch_shared_info_t;
78399 +
78400 +typedef struct {
78401 +    unsigned long cr2;
78402 +    unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
78403 +} arch_vcpu_info_t;
78404 +
78405 +#endif /* !__ASSEMBLY__ */
78406 +
78407 +/*
78408 + * Prefix forces emulation of some non-trapping instructions.
78409 + * Currently only CPUID.
78410 + */
78411 +#ifdef __ASSEMBLY__
78412 +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
78413 +#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
78414 +#else
78415 +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
78416 +#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
78417 +#endif
78418 +
78419 +#endif
78420 +
78421 +/*
78422 + * Local variables:
78423 + * mode: C
78424 + * c-set-style: "BSD"
78425 + * c-basic-offset: 4
78426 + * tab-width: 4
78427 + * indent-tabs-mode: nil
78428 + * End:
78429 + */
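The trap_info flags layout above is exercised through the TI_* accessors; a sketch of describing one vector for HYPERVISOR_set_trap_table(), with the vector, selector, and handler chosen purely for illustration:

    /* Hypothetical sketch: one virtual IDT entry with DPL 3, so ring-3
     * code may enter via the classic int $0x80. */
    static void fill_syscall_trap(trap_info_t *ti, unsigned long handler)
    {
            ti->vector  = 0x80;
            ti->flags   = 0;
            ti->cs      = FLAT_KERNEL_CS;
            ti->address = handler;

            TI_SET_DPL(ti, 3);  /* any ring may trap through this vector */
            TI_SET_IF(ti, 1);   /* clear event delivery on entry */
    }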
78430 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/arch-x86_64.h tmp-linux-2.6-xen.patch/include/xen/interface/arch-x86_64.h
78431 --- ref-linux-2.6.16.9/include/xen/interface/arch-x86_64.h      1970-01-01 01:00:00.000000000 +0100
78432 +++ tmp-linux-2.6-xen.patch/include/xen/interface/arch-x86_64.h 2006-04-10 00:05:57.000000000 +0200
78433 @@ -0,0 +1,271 @@
78434 +/******************************************************************************
78435 + * arch-x86_64.h
78436 + * 
78437 + * Guest OS interface to x86 64-bit Xen.
78438 + * 
78439 + * Copyright (c) 2004, K A Fraser
78440 + */
78441 +
78442 +#ifndef __XEN_PUBLIC_ARCH_X86_64_H__
78443 +#define __XEN_PUBLIC_ARCH_X86_64_H__
78444 +
78445 +#ifdef __XEN__
78446 +#define __DEFINE_GUEST_HANDLE(name, type) \
78447 +    typedef struct { type *p; } __guest_handle_ ## name
78448 +#else
78449 +#define __DEFINE_GUEST_HANDLE(name, type) \
78450 +    typedef type * __guest_handle_ ## name
78451 +#endif
78452 +
78453 +#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
78454 +#define GUEST_HANDLE(name)        __guest_handle_ ## name
78455 +
78456 +#ifndef __ASSEMBLY__
78457 +/* Guest handles for primitive C types. */
78458 +__DEFINE_GUEST_HANDLE(uchar, unsigned char);
78459 +__DEFINE_GUEST_HANDLE(uint,  unsigned int);
78460 +__DEFINE_GUEST_HANDLE(ulong, unsigned long);
78461 +DEFINE_GUEST_HANDLE(char);
78462 +DEFINE_GUEST_HANDLE(int);
78463 +DEFINE_GUEST_HANDLE(long);
78464 +DEFINE_GUEST_HANDLE(void);
78465 +#endif
78466 +
78467 +/*
78468 + * SEGMENT DESCRIPTOR TABLES
78469 + */
78470 +/*
78471 + * A number of GDT entries are reserved by Xen. These are not situated at the
78472 + * start of the GDT because some stupid OSes export hard-coded selector values
78473 + * in their ABI. These hard-coded values are always near the start of the GDT,
78474 + * so Xen places itself out of the way, at the far end of the GDT.
78475 + */
78476 +#define FIRST_RESERVED_GDT_PAGE  14
78477 +#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
78478 +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
78479 +
78480 +/*
78481 + * 64-bit segment selectors
78482 + * These flat segments are in the Xen-private section of every GDT. Since these
78483 + * are also present in the initial GDT, many OSes will be able to avoid
78484 + * installing their own GDT.
78485 + */
78486 +
78487 +#define FLAT_RING3_CS32 0xe023  /* GDT index 260 */
78488 +#define FLAT_RING3_CS64 0xe033  /* GDT index 261 */
78489 +#define FLAT_RING3_DS32 0xe02b  /* GDT index 262 */
78490 +#define FLAT_RING3_DS64 0x0000  /* NULL selector */
78491 +#define FLAT_RING3_SS32 0xe02b  /* GDT index 262 */
78492 +#define FLAT_RING3_SS64 0xe02b  /* GDT index 262 */
78493 +
78494 +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
78495 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
78496 +#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
78497 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
78498 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
78499 +#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
78500 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
78501 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
78502 +#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
78503 +
78504 +#define FLAT_USER_DS64 FLAT_RING3_DS64
78505 +#define FLAT_USER_DS32 FLAT_RING3_DS32
78506 +#define FLAT_USER_DS   FLAT_USER_DS64
78507 +#define FLAT_USER_CS64 FLAT_RING3_CS64
78508 +#define FLAT_USER_CS32 FLAT_RING3_CS32
78509 +#define FLAT_USER_CS   FLAT_USER_CS64
78510 +#define FLAT_USER_SS64 FLAT_RING3_SS64
78511 +#define FLAT_USER_SS32 FLAT_RING3_SS32
78512 +#define FLAT_USER_SS   FLAT_USER_SS64
78513 +
78514 +/* And the trap vector is... */
78515 +#define TRAP_INSTR "syscall"
78516 +
78517 +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
78518 +#define __HYPERVISOR_VIRT_END   0xFFFF880000000000
78519 +
78520 +#ifndef HYPERVISOR_VIRT_START
78521 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
78522 +#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
78523 +#endif
78524 +
78525 +/* Maximum number of virtual CPUs in multi-processor guests. */
78526 +#define MAX_VIRT_CPUS 32
78527 +
78528 +#ifndef __ASSEMBLY__
78529 +
78530 +/* The machine->physical mapping table starts at this address, read-only. */
78531 +#ifndef machine_to_phys_mapping
78532 +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
78533 +#endif
78534 +
78535 +/*
78536 + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
78537 + *  @which == SEGBASE_*  ;  @base == 64-bit base address
78538 + * Returns 0 on success.
78539 + */
78540 +#define SEGBASE_FS          0
78541 +#define SEGBASE_GS_USER     1
78542 +#define SEGBASE_GS_KERNEL   2
78543 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
78544 +
78545 +/*
78546 + * int HYPERVISOR_iret(void)
78547 + * All arguments are on the kernel stack, in the following format.
78548 + * Never returns if successful. Current kernel context is lost.
78549 + * The saved CS is mapped as follows:
78550 + *   RING0 -> RING3 kernel mode.
78551 + *   RING1 -> RING3 kernel mode.
78552 + *   RING2 -> RING3 kernel mode.
78553 + *   RING3 -> RING3 user mode.
78554 + * However, RING0 indicates that the guest kernel should return to itself
78555 + * directly with
78556 + *      orb   $3,1*8(%rsp)
78557 + *      iretq
78558 + * If flags contains VGCF_IN_SYSCALL:
78559 + *   Restore RAX, RIP, RFLAGS, RSP.
78560 + *   Discard R11, RCX, CS, SS.
78561 + * Otherwise:
78562 + *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
78563 + * All other registers are saved on hypercall entry and restored to user.
78564 + */
78565 +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
78566 +#define VGCF_IN_SYSCALL (1<<8)
78567 +struct iret_context {
78568 +    /* Top of stack (%rsp at point of hypercall). */
78569 +    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
78570 +    /* Bottom of iret stack frame. */
78571 +};
78572 +
78573 +/*
78574 + * Send an array of these to HYPERVISOR_set_trap_table().
78575 + * N.B. As in x86/32 mode, the privilege level specifies which modes may enter
78576 + * a trap via a software interrupt. Since rings 1 and 2 are unavailable, we
78577 + * allocate privilege levels as follows:
78578 + *  Level == 0: No one may enter
78579 + *  Level == 1: Kernel may enter
78580 + *  Level == 2: Kernel may enter
78581 + *  Level == 3: Everyone may enter
78582 + */
78583 +#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
78584 +#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
78585 +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
78586 +#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
78587 +typedef struct trap_info {
78588 +    uint8_t       vector;  /* exception vector                              */
78589 +    uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
78590 +    uint16_t      cs;      /* code selector                                 */
78591 +    unsigned long address; /* code offset                                   */
78592 +} trap_info_t;
78593 +DEFINE_GUEST_HANDLE(trap_info_t);
78594 +
78595 +#ifdef __GNUC__
78596 +/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
78597 +#define __DECL_REG(name) union { uint64_t r ## name, e ## name; }
78598 +#else
78599 +/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
78600 +#define __DECL_REG(name) uint64_t r ## name
78601 +#endif
78602 +
78603 +typedef struct cpu_user_regs {
78604 +    uint64_t r15;
78605 +    uint64_t r14;
78606 +    uint64_t r13;
78607 +    uint64_t r12;
78608 +    __DECL_REG(bp);
78609 +    __DECL_REG(bx);
78610 +    uint64_t r11;
78611 +    uint64_t r10;
78612 +    uint64_t r9;
78613 +    uint64_t r8;
78614 +    __DECL_REG(ax);
78615 +    __DECL_REG(cx);
78616 +    __DECL_REG(dx);
78617 +    __DECL_REG(si);
78618 +    __DECL_REG(di);
78619 +    uint32_t error_code;    /* private */
78620 +    uint32_t entry_vector;  /* private */
78621 +    __DECL_REG(ip);
78622 +    uint16_t cs, _pad0[1];
78623 +    uint8_t  saved_upcall_mask;
78624 +    uint8_t  _pad1[3];
78625 +    __DECL_REG(flags);      /* rflags.IF == !saved_upcall_mask */
78626 +    __DECL_REG(sp);
78627 +    uint16_t ss, _pad2[3];
78628 +    uint16_t es, _pad3[3];
78629 +    uint16_t ds, _pad4[3];
78630 +    uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
78631 +    uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
78632 +} cpu_user_regs_t;
78633 +DEFINE_GUEST_HANDLE(cpu_user_regs_t);
78634 +
78635 +#undef __DECL_REG
78636 +
78637 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
78638 +
78639 +/*
78640 + * The following is all CPU context. Note that the fpu_ctxt block is filled 
78641 + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
78642 + */
78643 +typedef struct vcpu_guest_context {
78644 +    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
78645 +    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
78646 +#define VGCF_I387_VALID (1<<0)
78647 +#define VGCF_HVM_GUEST  (1<<1)
78648 +#define VGCF_IN_KERNEL  (1<<2)
78649 +    unsigned long flags;                    /* VGCF_* flags                 */
78650 +    cpu_user_regs_t user_regs;              /* User-level CPU registers     */
78651 +    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
78652 +    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
78653 +    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
78654 +    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
78655 +    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
78656 +    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
78657 +    unsigned long event_callback_eip;
78658 +    unsigned long failsafe_callback_eip;
78659 +    unsigned long syscall_callback_eip;
78660 +    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
78661 +    /* Segment base addresses. */
78662 +    uint64_t      fs_base;
78663 +    uint64_t      gs_base_kernel;
78664 +    uint64_t      gs_base_user;
78665 +} vcpu_guest_context_t;
78666 +DEFINE_GUEST_HANDLE(vcpu_guest_context_t);
78667 +
78668 +typedef struct arch_shared_info {
78669 +    unsigned long max_pfn;                  /* max pfn that appears in table */
78670 +    /* Frame containing list of mfns containing list of mfns containing p2m. */
78671 +    unsigned long pfn_to_mfn_frame_list_list;
78672 +    unsigned long nmi_reason;
78673 +} arch_shared_info_t;
78674 +
78675 +typedef struct {
78676 +    unsigned long cr2;
78677 +    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
78678 +} arch_vcpu_info_t;
78679 +
78680 +#endif /* !__ASSEMBLY__ */
78681 +
78682 +/*
78683 + * Prefix forces emulation of some non-trapping instructions.
78684 + * Currently only CPUID.
78685 + */
78686 +#ifdef __ASSEMBLY__
78687 +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
78688 +#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
78689 +#else
78690 +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
78691 +#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
78692 +#endif
78693 +
78694 +#endif
78695 +
78696 +/*
78697 + * Local variables:
78698 + * mode: C
78699 + * c-set-style: "BSD"
78700 + * c-basic-offset: 4
78701 + * tab-width: 4
78702 + * indent-tabs-mode: nil
78703 + * End:
78704 + */
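The __DECL_REG trick above gives gcc code both register spellings for the same 64-bit slot; a small illustration (the function names are hypothetical):

    /* With gcc, 'rax' and 'eax' below name the same member of
     * struct cpu_user_regs, courtesy of the anonymous union. */
    static unsigned long guest_syscall_nr(const struct cpu_user_regs *regs)
    {
            return regs->rax;          /* identical to regs->eax */
    }

    static void set_guest_retval(struct cpu_user_regs *regs, unsigned long v)
    {
            regs->rax = v;             /* visible through regs->eax too */
    }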
78705 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/COPYING tmp-linux-2.6-xen.patch/include/xen/interface/COPYING
78706 --- ref-linux-2.6.16.9/include/xen/interface/COPYING    1970-01-01 01:00:00.000000000 +0100
78707 +++ tmp-linux-2.6-xen.patch/include/xen/interface/COPYING       2006-04-10 00:05:57.000000000 +0200
78708 @@ -0,0 +1,28 @@
78709 +XEN NOTICE
78710 +==========
78711 +
78712 +This copyright applies to all files within this subdirectory. All
78713 +other files in the Xen source distribution are covered by version 2 of
78714 +the GNU General Public License.
78715 +
78716 + -- Keir Fraser (on behalf of the Xen team)
78717 +
78718 +=====================================================================
78719 +
78720 +Permission is hereby granted, free of charge, to any person obtaining a copy
78721 +of this software and associated documentation files (the "Software"), to
78722 +deal in the Software without restriction, including without limitation the
78723 +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
78724 +sell copies of the Software, and to permit persons to whom the Software is
78725 +furnished to do so, subject to the following conditions:
78726 +
78727 +The above copyright notice and this permission notice shall be included in
78728 +all copies or substantial portions of the Software.
78729 +
78730 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
78731 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
78732 +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
78733 +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
78734 +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
78735 +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
78736 +DEALINGS IN THE SOFTWARE.
78737 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/dom0_ops.h tmp-linux-2.6-xen.patch/include/xen/interface/dom0_ops.h
78738 --- ref-linux-2.6.16.9/include/xen/interface/dom0_ops.h 1970-01-01 01:00:00.000000000 +0100
78739 +++ tmp-linux-2.6-xen.patch/include/xen/interface/dom0_ops.h    2006-04-10 00:05:57.000000000 +0200
78740 @@ -0,0 +1,531 @@
78741 +/******************************************************************************
78742 + * dom0_ops.h
78743 + * 
78744 + * Process command requests from domain-0 guest OS.
78745 + * 
78746 + * Copyright (c) 2002-2003, B Dragovic
78747 + * Copyright (c) 2002-2004, K Fraser
78748 + */
78749 +
78750 +
78751 +#ifndef __XEN_PUBLIC_DOM0_OPS_H__
78752 +#define __XEN_PUBLIC_DOM0_OPS_H__
78753 +
78754 +#include "xen.h"
78755 +#include "sched_ctl.h"
78756 +
78757 +/*
78758 + * Make sure you increment the interface version whenever you modify this file!
78759 + * This makes sure that old versions of dom0 tools will stop working in a
78760 + * well-defined way (rather than crashing the machine, for instance).
78761 + */
78762 +#define DOM0_INTERFACE_VERSION   0x03000000
78763 +
78764 +/************************************************************************/
78765 +
78766 +#define DOM0_GETMEMLIST        2
78767 +typedef struct dom0_getmemlist {
78768 +    /* IN variables. */
78769 +    domid_t       domain;
78770 +    unsigned long max_pfns;
78771 +    GUEST_HANDLE(ulong) buffer;
78772 +    /* OUT variables. */
78773 +    unsigned long num_pfns;
78774 +} dom0_getmemlist_t;
78775 +DEFINE_GUEST_HANDLE(dom0_getmemlist_t);
78776 +
78777 +#define DOM0_SCHEDCTL          6
78778 + /* struct sched_ctl_cmd is from sched_ctl.h */
78779 +typedef struct sched_ctl_cmd dom0_schedctl_t;
78780 +DEFINE_GUEST_HANDLE(dom0_schedctl_t);
78781 +
78782 +#define DOM0_ADJUSTDOM         7
78783 +/* struct sched_adjdom_cmd is from sched_ctl.h */
78784 +typedef struct sched_adjdom_cmd dom0_adjustdom_t;
78785 +DEFINE_GUEST_HANDLE(dom0_adjustdom_t);
78786 +
78787 +#define DOM0_CREATEDOMAIN      8
78788 +typedef struct dom0_createdomain {
78789 +    /* IN parameters */
78790 +    uint32_t ssidref;
78791 +    xen_domain_handle_t handle;
78792 +    /* IN/OUT parameters. */
78793 +    /* Identifier for new domain (auto-allocate if zero is specified). */
78794 +    domid_t domain;
78795 +} dom0_createdomain_t;
78796 +DEFINE_GUEST_HANDLE(dom0_createdomain_t);
78797 +
78798 +#define DOM0_DESTROYDOMAIN     9
78799 +typedef struct dom0_destroydomain {
78800 +    /* IN variables. */
78801 +    domid_t domain;
78802 +} dom0_destroydomain_t;
78803 +DEFINE_GUEST_HANDLE(dom0_destroydomain_t);
78804 +
78805 +#define DOM0_PAUSEDOMAIN      10
78806 +typedef struct dom0_pausedomain {
78807 +    /* IN parameters. */
78808 +    domid_t domain;
78809 +} dom0_pausedomain_t;
78810 +DEFINE_GUEST_HANDLE(dom0_pausedomain_t);
78811 +
78812 +#define DOM0_UNPAUSEDOMAIN    11
78813 +typedef struct dom0_unpausedomain {
78814 +    /* IN parameters. */
78815 +    domid_t domain;
78816 +} dom0_unpausedomain_t;
78817 +DEFINE_GUEST_HANDLE(dom0_unpausedomain_t);
78818 +
78819 +#define DOM0_GETDOMAININFO    12
78820 +typedef struct dom0_getdomaininfo {
78821 +    /* IN variables. */
78822 +    domid_t  domain;                  /* NB. IN/OUT variable. */
78823 +    /* OUT variables. */
78824 +#define DOMFLAGS_DYING     (1<<0) /* Domain is scheduled to die.             */
78825 +#define DOMFLAGS_SHUTDOWN  (1<<2) /* The guest OS has shut down.             */
78826 +#define DOMFLAGS_PAUSED    (1<<3) /* Currently paused by control software.   */
78827 +#define DOMFLAGS_BLOCKED   (1<<4) /* Currently blocked pending an event.     */
78828 +#define DOMFLAGS_RUNNING   (1<<5) /* Domain is currently running.            */
78829 +#define DOMFLAGS_CPUMASK      255 /* CPU to which this domain is bound.      */
78830 +#define DOMFLAGS_CPUSHIFT       8
78831 +#define DOMFLAGS_SHUTDOWNMASK 255 /* DOMFLAGS_SHUTDOWN guest-supplied code.  */
78832 +#define DOMFLAGS_SHUTDOWNSHIFT 16
78833 +    uint32_t flags;
78834 +    unsigned long tot_pages;
78835 +    unsigned long max_pages;
78836 +    unsigned long shared_info_frame;       /* MFN of shared_info struct */
78837 +    uint64_t cpu_time;
78838 +    uint32_t nr_online_vcpus;     /* Number of VCPUs currently online. */
78839 +    uint32_t max_vcpu_id;         /* Maximum VCPUID in use by this domain. */
78840 +    uint32_t ssidref;
78841 +    xen_domain_handle_t handle;
78842 +} dom0_getdomaininfo_t;
78843 +DEFINE_GUEST_HANDLE(dom0_getdomaininfo_t);
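The flags word above packs state bits, the bound CPU, and the guest's shutdown code; a sketch of taking it apart with the masks and shifts just defined (the helper names are hypothetical):

    /* Hypothetical decoding of dom0_getdomaininfo.flags. */
    static inline unsigned int domflags_cpu(uint32_t flags)
    {
            return (flags >> DOMFLAGS_CPUSHIFT) & DOMFLAGS_CPUMASK;
    }

    static inline unsigned int domflags_shutdown_code(uint32_t flags)
    {
            return (flags >> DOMFLAGS_SHUTDOWNSHIFT) & DOMFLAGS_SHUTDOWNMASK;
    }

    static inline int domain_is_running(uint32_t flags)
    {
            return (flags & DOMFLAGS_RUNNING) != 0;
    }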
78844 +
78845 +#define DOM0_SETVCPUCONTEXT   13
78846 +typedef struct dom0_setvcpucontext {
78847 +    /* IN variables. */
78848 +    domid_t               domain;
78849 +    uint32_t              vcpu;
78850 +    /* IN/OUT parameters */
78851 +    GUEST_HANDLE(vcpu_guest_context_t) ctxt;
78852 +} dom0_setvcpucontext_t;
78853 +DEFINE_GUEST_HANDLE(dom0_setvcpucontext_t);
78854 +
78855 +#define DOM0_MSR              15
78856 +typedef struct dom0_msr {
78857 +    /* IN variables. */
78858 +    uint32_t write;
78859 +    cpumap_t cpu_mask;
78860 +    uint32_t msr;
78861 +    uint32_t in1;
78862 +    uint32_t in2;
78863 +    /* OUT variables. */
78864 +    uint32_t out1;
78865 +    uint32_t out2;
78866 +} dom0_msr_t;
78867 +DEFINE_GUEST_HANDLE(dom0_msr_t);
78868 +
78869 +/*
78870 + * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
78871 + * 1 January, 1970 if the current system time was <system_time>.
78872 + */
78873 +#define DOM0_SETTIME          17
78874 +typedef struct dom0_settime {
78875 +    /* IN variables. */
78876 +    uint32_t secs;
78877 +    uint32_t nsecs;
78878 +    uint64_t system_time;
78879 +} dom0_settime_t;
78880 +DEFINE_GUEST_HANDLE(dom0_settime_t);
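Put arithmetically, the definition above means the wall clock read at any later system time 'now' is secs.nsecs plus the system time elapsed since system_time; a sketch (nanosecond carry and overflow ignored, helper name hypothetical):

    /* Hypothetical illustration of the DOM0_SETTIME semantics above. */
    static inline uint64_t wallclock_ns(const dom0_settime_t *st,
                                        uint64_t now_ns)
    {
            uint64_t base = (uint64_t)st->secs * 1000000000ULL + st->nsecs;
            return base + (now_ns - st->system_time);
    }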
78881 +
78882 +#define DOM0_GETPAGEFRAMEINFO 18
78883 +#define LTAB_SHIFT 28
78884 +#define NOTAB 0         /* normal page */
78885 +#define L1TAB (1<<LTAB_SHIFT)
78886 +#define L2TAB (2<<LTAB_SHIFT)
78887 +#define L3TAB (3<<LTAB_SHIFT)
78888 +#define L4TAB (4<<LTAB_SHIFT)
78889 +#define LPINTAB  (1<<31)
78890 +#define XTAB  (0xf<<LTAB_SHIFT) /* invalid page */
78891 +#define LTAB_MASK XTAB
78892 +#define LTABTYPE_MASK (0x7<<LTAB_SHIFT)
78893 +
78894 +typedef struct dom0_getpageframeinfo {
78895 +    /* IN variables. */
78896 +    unsigned long mfn;     /* Machine page frame number to query.       */
78897 +    domid_t domain;        /* To which domain does the frame belong?    */
78898 +    /* OUT variables. */
78899 +    /* Is the page PINNED to a type? */
78900 +    uint32_t type;         /* see above type defs */
78901 +} dom0_getpageframeinfo_t;
78902 +DEFINE_GUEST_HANDLE(dom0_getpageframeinfo_t);
78903 +
78904 +/*
78905 + * Read console content from Xen buffer ring.
78906 + */
78907 +#define DOM0_READCONSOLE      19
78908 +typedef struct dom0_readconsole {
78909 +    /* IN variables. */
78910 +    uint32_t clear;            /* Non-zero -> clear after reading. */
78911 +    /* IN/OUT variables. */
78912 +    GUEST_HANDLE(char) buffer; /* In: Buffer start; Out: Used buffer start */
78913 +    uint32_t count;            /* In: Buffer size;  Out: Used buffer size  */
78914 +} dom0_readconsole_t;
78915 +DEFINE_GUEST_HANDLE(dom0_readconsole_t);
78916 +
78917 +/*
78918 + * Set which physical cpus a vcpu can execute on.
78919 + */
78920 +#define DOM0_SETVCPUAFFINITY  20
78921 +typedef struct dom0_setvcpuaffinity {
78922 +    /* IN variables. */
78923 +    domid_t   domain;
78924 +    uint32_t  vcpu;
78925 +    cpumap_t  cpumap;
78926 +} dom0_setvcpuaffinity_t;
78927 +DEFINE_GUEST_HANDLE(dom0_setvcpuaffinity_t);
78928 +
78929 +/* Get trace buffers machine base address */
78930 +#define DOM0_TBUFCONTROL       21
78931 +typedef struct dom0_tbufcontrol {
78932 +    /* IN variables */
78933 +#define DOM0_TBUF_GET_INFO     0
78934 +#define DOM0_TBUF_SET_CPU_MASK 1
78935 +#define DOM0_TBUF_SET_EVT_MASK 2
78936 +#define DOM0_TBUF_SET_SIZE     3
78937 +#define DOM0_TBUF_ENABLE       4
78938 +#define DOM0_TBUF_DISABLE      5
78939 +    uint32_t      op;
78940 +    /* IN/OUT variables */
78941 +    cpumap_t      cpu_mask;
78942 +    uint32_t      evt_mask;
78943 +    /* OUT variables */
78944 +    unsigned long buffer_mfn;
78945 +    uint32_t size;
78946 +} dom0_tbufcontrol_t;
78947 +DEFINE_GUEST_HANDLE(dom0_tbufcontrol_t);
78948 +
78949 +/*
78950 + * Get physical information about the host machine
78951 + */
78952 +#define DOM0_PHYSINFO         22
78953 +typedef struct dom0_physinfo {
78954 +    uint32_t threads_per_core;
78955 +    uint32_t cores_per_socket;
78956 +    uint32_t sockets_per_node;
78957 +    uint32_t nr_nodes;
78958 +    uint32_t cpu_khz;
78959 +    unsigned long total_pages;
78960 +    unsigned long free_pages;
78961 +    uint32_t hw_cap[8];
78962 +} dom0_physinfo_t;
78963 +DEFINE_GUEST_HANDLE(dom0_physinfo_t);
78964 +
78965 +/*
78966 + * Get the ID of the current scheduler.
78967 + */
78968 +#define DOM0_SCHED_ID        24
78969 +typedef struct dom0_sched_id {
78970 +    /* OUT variable */
78971 +    uint32_t sched_id;
78972 +} dom0_sched_id_t;
78973 +DEFINE_GUEST_HANDLE(dom0_sched_id_t);
78974 +
78975 +/*
78976 + * Control shadow pagetables operation
78977 + */
78978 +#define DOM0_SHADOW_CONTROL  25
78979 +
78980 +#define DOM0_SHADOW_CONTROL_OP_OFF         0
78981 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_TEST 1
78982 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY 2
78983 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE 3
78984 +
78985 +#define DOM0_SHADOW_CONTROL_OP_FLUSH       10     /* table ops */
78986 +#define DOM0_SHADOW_CONTROL_OP_CLEAN       11
78987 +#define DOM0_SHADOW_CONTROL_OP_PEEK        12
78988 +
78989 +typedef struct dom0_shadow_control_stats {
78990 +    uint32_t fault_count;
78991 +    uint32_t dirty_count;
78992 +    uint32_t dirty_net_count;
78993 +    uint32_t dirty_block_count;
78994 +} dom0_shadow_control_stats_t;
78995 +DEFINE_GUEST_HANDLE(dom0_shadow_control_stats_t);
78996 +
78997 +typedef struct dom0_shadow_control {
78998 +    /* IN variables. */
78999 +    domid_t        domain;
79000 +    uint32_t       op;
79001 +    GUEST_HANDLE(ulong) dirty_bitmap;
79002 +    /* IN/OUT variables. */
79003 +    unsigned long  pages;        /* size of buffer, updated with actual size */
79004 +    /* OUT variables. */
79005 +    dom0_shadow_control_stats_t stats;
79006 +} dom0_shadow_control_t;
79007 +DEFINE_GUEST_HANDLE(dom0_shadow_control_t);
79008 +
79009 +#define DOM0_SETDOMAINMAXMEM   28
79010 +typedef struct dom0_setdomainmaxmem {
79011 +    /* IN variables. */
79012 +    domid_t       domain;
79013 +    unsigned long max_memkb;
79014 +} dom0_setdomainmaxmem_t;
79015 +DEFINE_GUEST_HANDLE(dom0_setdomainmaxmem_t);
79016 +
79017 +#define DOM0_GETPAGEFRAMEINFO2 29   /* batched interface */
79018 +typedef struct dom0_getpageframeinfo2 {
79019 +    /* IN variables. */
79020 +    domid_t        domain;
79021 +    unsigned long  num;
79022 +    /* IN/OUT variables. */
79023 +    GUEST_HANDLE(ulong) array;
79024 +} dom0_getpageframeinfo2_t;
79025 +DEFINE_GUEST_HANDLE(dom0_getpageframeinfo2_t);
79026 +
79027 +/*
79028 + * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
79029 + * On x86, @type is an architecture-defined MTRR memory type.
79030 + * On success, returns the MTRR that was used (@reg) and a handle that can
79031 + * be passed to DOM0_DEL_MEMTYPE to accurately tear down the new setting.
79032 + * (x86-specific).
79033 + */
79034 +#define DOM0_ADD_MEMTYPE         31
79035 +typedef struct dom0_add_memtype {
79036 +    /* IN variables. */
79037 +    unsigned long mfn;
79038 +    unsigned long nr_mfns;
79039 +    uint32_t      type;
79040 +    /* OUT variables. */
79041 +    uint32_t      handle;
79042 +    uint32_t      reg;
79043 +} dom0_add_memtype_t;
79044 +DEFINE_GUEST_HANDLE(dom0_add_memtype_t);
79045 +
79046 +/*
79047 + * Tear down an existing memory-range type. If @handle is remembered then it
79048 + * should be passed in to accurately tear down the correct setting (in case
79049 + * of overlapping memory regions with differing types). If it is not known
79050 + * then @handle should be set to zero. In all cases @reg must be set.
79051 + * (x86-specific).
79052 + */
79053 +#define DOM0_DEL_MEMTYPE         32
79054 +typedef struct dom0_del_memtype {
79055 +    /* IN variables. */
79056 +    uint32_t handle;
79057 +    uint32_t reg;
79058 +} dom0_del_memtype_t;
79059 +DEFINE_GUEST_HANDLE(dom0_del_memtype_t);
79060 +
79061 +/* Read current type of an MTRR (x86-specific). */
79062 +#define DOM0_READ_MEMTYPE        33
79063 +typedef struct dom0_read_memtype {
79064 +    /* IN variables. */
79065 +    uint32_t reg;
79066 +    /* OUT variables. */
79067 +    unsigned long mfn;
79068 +    unsigned long nr_mfns;
79069 +    uint32_t type;
79070 +} dom0_read_memtype_t;
79071 +DEFINE_GUEST_HANDLE(dom0_read_memtype_t);
79072 +
79073 +/* Interface for controlling Xen software performance counters. */
79074 +#define DOM0_PERFCCONTROL        34
79075 +/* Sub-operations: */
79076 +#define DOM0_PERFCCONTROL_OP_RESET 1   /* Reset all counters to zero. */
79077 +#define DOM0_PERFCCONTROL_OP_QUERY 2   /* Get perfctr information. */
79078 +typedef struct dom0_perfc_desc {
79079 +    char         name[80];             /* name of perf counter */
79080 +    uint32_t     nr_vals;              /* number of values for this counter */
79081 +    uint32_t     vals[64];             /* array of values */
79082 +} dom0_perfc_desc_t;
79083 +DEFINE_GUEST_HANDLE(dom0_perfc_desc_t);
79084 +typedef struct dom0_perfccontrol {
79085 +    /* IN variables. */
79086 +    uint32_t       op;                /*  DOM0_PERFCCONTROL_OP_??? */
79087 +    /* OUT variables. */
79088 +    uint32_t       nr_counters;       /*  number of counters */
79089 +    GUEST_HANDLE(dom0_perfc_desc_t) desc; /*  counter information (or NULL) */
79090 +} dom0_perfccontrol_t;
79091 +DEFINE_GUEST_HANDLE(dom0_perfccontrol_t);
79092 +
79093 +#define DOM0_MICROCODE           35
79094 +typedef struct dom0_microcode {
79095 +    /* IN variables. */
79096 +    GUEST_HANDLE(void) data;          /* Pointer to microcode data */
79097 +    uint32_t length;                  /* Length of microcode data. */
79098 +} dom0_microcode_t;
79099 +DEFINE_GUEST_HANDLE(dom0_microcode_t);
79100 +
79101 +#define DOM0_IOPORT_PERMISSION   36
79102 +typedef struct dom0_ioport_permission {
79103 +    domid_t  domain;                  /* domain to be affected */
79104 +    uint32_t first_port;              /* first port in range */
79105 +    uint32_t nr_ports;                /* size of port range */
79106 +    uint8_t  allow_access;            /* allow or deny access to range? */
79107 +} dom0_ioport_permission_t;
79108 +DEFINE_GUEST_HANDLE(dom0_ioport_permission_t);
79109 +
79110 +#define DOM0_GETVCPUCONTEXT      37
79111 +typedef struct dom0_getvcpucontext {
79112 +    /* IN variables. */
79113 +    domid_t  domain;                  /* domain to be affected */
79114 +    uint32_t vcpu;                    /* vcpu # */
79115 +    /* OUT variables. */
79116 +    GUEST_HANDLE(vcpu_guest_context_t) ctxt;
79117 +} dom0_getvcpucontext_t;
79118 +DEFINE_GUEST_HANDLE(dom0_getvcpucontext_t);
79119 +
79120 +#define DOM0_GETVCPUINFO         43
79121 +typedef struct dom0_getvcpuinfo {
79122 +    /* IN variables. */
79123 +    domid_t  domain;                  /* domain to be affected */
79124 +    uint32_t vcpu;                    /* vcpu # */
79125 +    /* OUT variables. */
79126 +    uint8_t  online;                  /* currently online (not hotplugged)? */
79127 +    uint8_t  blocked;                 /* blocked waiting for an event? */
79128 +    uint8_t  running;                 /* currently scheduled on its CPU? */
79129 +    uint64_t cpu_time;                /* total cpu time consumed (ns) */
79130 +    uint32_t cpu;                     /* current mapping   */
79131 +    cpumap_t cpumap;                  /* allowable mapping */
79132 +} dom0_getvcpuinfo_t;
79133 +DEFINE_GUEST_HANDLE(dom0_getvcpuinfo_t);
79134 +
79135 +#define DOM0_GETDOMAININFOLIST   38
79136 +typedef struct dom0_getdomaininfolist {
79137 +    /* IN variables. */
79138 +    domid_t               first_domain;
79139 +    uint32_t              max_domains;
79140 +    GUEST_HANDLE(dom0_getdomaininfo_t) buffer;
79141 +    /* OUT variables. */
79142 +    uint32_t              num_domains;
79143 +} dom0_getdomaininfolist_t;
79144 +DEFINE_GUEST_HANDLE(dom0_getdomaininfolist_t);
79145 +
79146 +#define DOM0_PLATFORM_QUIRK      39
79147 +#define QUIRK_NOIRQBALANCING  1
79148 +typedef struct dom0_platform_quirk {
79149 +    /* IN variables. */
79150 +    uint32_t quirk_id;
79151 +} dom0_platform_quirk_t;
79152 +DEFINE_GUEST_HANDLE(dom0_platform_quirk_t);
79153 +
79154 +#define DOM0_PHYSICAL_MEMORY_MAP 40
79155 +typedef struct dom0_memory_map_entry {
79156 +    uint64_t start, end;
79157 +    uint32_t flags; /* reserved */
79158 +    uint8_t  is_ram;
79159 +} dom0_memory_map_entry_t;
79160 +DEFINE_GUEST_HANDLE(dom0_memory_map_entry_t);
79161 +typedef struct dom0_physical_memory_map {
79162 +    /* IN variables. */
79163 +    uint32_t max_map_entries;
79164 +    /* OUT variables. */
79165 +    uint32_t nr_map_entries;
79166 +    GUEST_HANDLE(dom0_memory_map_entry_t) memory_map;
79167 +} dom0_physical_memory_map_t;
79168 +DEFINE_GUEST_HANDLE(dom0_physical_memory_map_t);
79169 +
79170 +#define DOM0_MAX_VCPUS 41
79171 +typedef struct dom0_max_vcpus {
79172 +    domid_t  domain;        /* domain to be affected */
79173 +    uint32_t max;           /* maximum number of vcpus */
79174 +} dom0_max_vcpus_t;
79175 +DEFINE_GUEST_HANDLE(dom0_max_vcpus_t);
79176 +
79177 +#define DOM0_SETDOMAINHANDLE 44
79178 +typedef struct dom0_setdomainhandle {
79179 +    domid_t domain;
79180 +    xen_domain_handle_t handle;
79181 +} dom0_setdomainhandle_t;
79182 +DEFINE_GUEST_HANDLE(dom0_setdomainhandle_t);
79183 +
79184 +#define DOM0_SETDEBUGGING 45
79185 +typedef struct dom0_setdebugging {
79186 +    domid_t domain;
79187 +    uint8_t enable;
79188 +} dom0_setdebugging_t;
79189 +DEFINE_GUEST_HANDLE(dom0_setdebugging_t);
79190 +
79191 +#define DOM0_IRQ_PERMISSION 46
79192 +typedef struct dom0_irq_permission {
79193 +    domid_t domain;          /* domain to be affected */
79194 +    uint8_t pirq;
79195 +    uint8_t allow_access;    /* flag to specify enable/disable of IRQ access */
79196 +} dom0_irq_permission_t;
79197 +DEFINE_GUEST_HANDLE(dom0_irq_permission_t);
79198 +
79199 +#define DOM0_IOMEM_PERMISSION 47
79200 +typedef struct dom0_iomem_permission {
79201 +    domid_t  domain;          /* domain to be affected */
79202 +    unsigned long first_mfn;  /* first page (physical page number) in range */
79203 +    unsigned long nr_mfns;    /* number of pages in range (>0) */
79204 +    uint8_t allow_access;     /* allow (!0) or deny (0) access to range? */
79205 +} dom0_iomem_permission_t;
79206 +DEFINE_GUEST_HANDLE(dom0_iomem_permission_t);
79207 +
79208 +#define DOM0_HYPERCALL_INIT   48
79209 +typedef struct dom0_hypercall_init {
79210 +    domid_t  domain;          /* domain to be affected */
79211 +    unsigned long mfn;        /* machine frame to be initialised */
79212 +} dom0_hypercall_init_t;
79213 +DEFINE_GUEST_HANDLE(dom0_hypercall_init_t);
79214 +
79215 +typedef struct dom0_op {
79216 +    uint32_t cmd;
79217 +    uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
79218 +    union {
79219 +        struct dom0_createdomain      createdomain;
79220 +        struct dom0_pausedomain       pausedomain;
79221 +        struct dom0_unpausedomain     unpausedomain;
79222 +        struct dom0_destroydomain     destroydomain;
79223 +        struct dom0_getmemlist        getmemlist;
79224 +        struct sched_ctl_cmd          schedctl;
79225 +        struct sched_adjdom_cmd       adjustdom;
79226 +        struct dom0_setvcpucontext    setvcpucontext;
79227 +        struct dom0_getdomaininfo     getdomaininfo;
79228 +        struct dom0_getpageframeinfo  getpageframeinfo;
79229 +        struct dom0_msr               msr;
79230 +        struct dom0_settime           settime;
79231 +        struct dom0_readconsole       readconsole;
79232 +        struct dom0_setvcpuaffinity   setvcpuaffinity;
79233 +        struct dom0_tbufcontrol       tbufcontrol;
79234 +        struct dom0_physinfo          physinfo;
79235 +        struct dom0_sched_id          sched_id;
79236 +        struct dom0_shadow_control    shadow_control;
79237 +        struct dom0_setdomainmaxmem   setdomainmaxmem;
79238 +        struct dom0_getpageframeinfo2 getpageframeinfo2;
79239 +        struct dom0_add_memtype       add_memtype;
79240 +        struct dom0_del_memtype       del_memtype;
79241 +        struct dom0_read_memtype      read_memtype;
79242 +        struct dom0_perfccontrol      perfccontrol;
79243 +        struct dom0_microcode         microcode;
79244 +        struct dom0_ioport_permission ioport_permission;
79245 +        struct dom0_getvcpucontext    getvcpucontext;
79246 +        struct dom0_getvcpuinfo       getvcpuinfo;
79247 +        struct dom0_getdomaininfolist getdomaininfolist;
79248 +        struct dom0_platform_quirk    platform_quirk;
79249 +        struct dom0_physical_memory_map physical_memory_map;
79250 +        struct dom0_max_vcpus         max_vcpus;
79251 +        struct dom0_setdomainhandle   setdomainhandle;
79252 +        struct dom0_setdebugging      setdebugging;
79253 +        struct dom0_irq_permission    irq_permission;
79254 +        struct dom0_iomem_permission  iomem_permission;
79255 +        struct dom0_hypercall_init    hypercall_init;
79256 +        uint8_t                       pad[128];
79257 +    } u;
79258 +} dom0_op_t;
79259 +DEFINE_GUEST_HANDLE(dom0_op_t);
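/*
 * Illustrative sketch of driving the command union above: a dom0 kernel
 * querying the scheduler ID. Assumes the HYPERVISOR_dom0_op hypercall
 * wrapper and DOM0_INTERFACE_VERSION defined elsewhere in this tree.
 */
static int query_sched_id(uint32_t *sched_id)
{
    dom0_op_t op = {
        .cmd               = DOM0_SCHED_ID,
        .interface_version = DOM0_INTERFACE_VERSION,
    };
    int rc = HYPERVISOR_dom0_op(&op);   /* negative on failure */
    if (rc == 0)
        *sched_id = op.u.sched_id.sched_id;
    return rc;
}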
79260 +
79261 +#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
79262 +
79263 +/*
79264 + * Local variables:
79265 + * mode: C
79266 + * c-set-style: "BSD"
79267 + * c-basic-offset: 4
79268 + * tab-width: 4
79269 + * indent-tabs-mode: nil
79270 + * End:
79271 + */
79272 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/event_channel.h tmp-linux-2.6-xen.patch/include/xen/interface/event_channel.h
79273 --- ref-linux-2.6.16.9/include/xen/interface/event_channel.h    1970-01-01 01:00:00.000000000 +0100
79274 +++ tmp-linux-2.6-xen.patch/include/xen/interface/event_channel.h       2006-04-10 00:05:57.000000000 +0200
79275 @@ -0,0 +1,205 @@
79276 +/******************************************************************************
79277 + * event_channel.h
79278 + * 
79279 + * Event channels between domains.
79280 + * 
79281 + * Copyright (c) 2003-2004, K A Fraser.
79282 + */
79283 +
79284 +#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
79285 +#define __XEN_PUBLIC_EVENT_CHANNEL_H__
79286 +
79287 +typedef uint32_t evtchn_port_t;
79288 +DEFINE_GUEST_HANDLE(evtchn_port_t);
79289 +
79290 +/*
79291 + * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
79292 + * accepting interdomain bindings from domain <remote_dom>. A fresh port
79293 + * is allocated in <dom> and returned as <port>.
79294 + * NOTES:
79295 + *  1. If the caller is unprivileged then <dom> must be DOMID_SELF.
79296 + *  2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
79297 + */
79298 +#define EVTCHNOP_alloc_unbound    6
79299 +typedef struct evtchn_alloc_unbound {
79300 +    /* IN parameters */
79301 +    domid_t dom, remote_dom;
79302 +    /* OUT parameters */
79303 +    evtchn_port_t port;
79304 +} evtchn_alloc_unbound_t;
79305 +
79306 +/*
79307 + * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
79308 + * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
79309 + * a port that is unbound and marked as accepting bindings from the calling
79310 + * domain. A fresh port is allocated in the calling domain and returned as
79311 + * <local_port>.
79312 + * NOTES:
79313 + *  1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
79314 + */
79315 +#define EVTCHNOP_bind_interdomain 0
79316 +typedef struct evtchn_bind_interdomain {
79317 +    /* IN parameters. */
79318 +    domid_t remote_dom;
79319 +    evtchn_port_t remote_port;
79320 +    /* OUT parameters. */
79321 +    evtchn_port_t local_port;
79322 +} evtchn_bind_interdomain_t;
79323 +
79324 +/*
79325 + * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
79326 + * vcpu.
79327 + * NOTES:
79328 + *  1. A virtual IRQ may be bound to at most one event channel per vcpu.
79329 + *  2. The allocated event channel is bound to the specified vcpu. The binding
79330 + *     may not be changed.
79331 + */
79332 +#define EVTCHNOP_bind_virq        1
79333 +typedef struct evtchn_bind_virq {
79334 +    /* IN parameters. */
79335 +    uint32_t virq;
79336 +    uint32_t vcpu;
79337 +    /* OUT parameters. */
79338 +    evtchn_port_t port;
79339 +} evtchn_bind_virq_t;
79340 +
79341 +/*
79342 + * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
79343 + * NOTES:
79344 + *  1. A physical IRQ may be bound to at most one event channel per domain.
79345 + *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
79346 + */
79347 +#define EVTCHNOP_bind_pirq        2
79348 +typedef struct evtchn_bind_pirq {
79349 +    /* IN parameters. */
79350 +    uint32_t pirq;
79351 +#define BIND_PIRQ__WILL_SHARE 1
79352 +    uint32_t flags; /* BIND_PIRQ__* */
79353 +    /* OUT parameters. */
79354 +    evtchn_port_t port;
79355 +} evtchn_bind_pirq_t;
79356 +
79357 +/*
79358 + * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
79359 + * NOTES:
79360 + *  1. The allocated event channel is bound to the specified vcpu. The binding
79361 + *     may not be changed.
79362 + */
79363 +#define EVTCHNOP_bind_ipi         7
79364 +typedef struct evtchn_bind_ipi {
79365 +    uint32_t vcpu;
79366 +    /* OUT parameters. */
79367 +    evtchn_port_t port;
79368 +} evtchn_bind_ipi_t;
79369 +
79370 +/*
79371 + * EVTCHNOP_close: Close a local event channel <port>. If the channel is
79372 + * interdomain then the remote end is placed in the unbound state
79373 + * (EVTCHNSTAT_unbound), awaiting a new connection.
79374 + */
79375 +#define EVTCHNOP_close            3
79376 +typedef struct evtchn_close {
79377 +    /* IN parameters. */
79378 +    evtchn_port_t port;
79379 +} evtchn_close_t;
79380 +
79381 +/*
79382 + * EVTCHNOP_send: Send an event to the remote end of the channel whose local
79383 + * endpoint is <port>.
79384 + */
79385 +#define EVTCHNOP_send             4
79386 +typedef struct evtchn_send {
79387 +    /* IN parameters. */
79388 +    evtchn_port_t port;
79389 +} evtchn_send_t;
79390 +
79391 +/*
79392 + * EVTCHNOP_status: Get the current status of the communication channel which
79393 + * has an endpoint at <dom, port>.
79394 + * NOTES:
79395 + *  1. <dom> may be specified as DOMID_SELF.
79396 + *  2. Only a sufficiently-privileged domain may obtain the status of an event
79397 + *     channel for which <dom> is not DOMID_SELF.
79398 + */
79399 +#define EVTCHNOP_status           5
79400 +typedef struct evtchn_status {
79401 +    /* IN parameters */
79402 +    domid_t  dom;
79403 +    evtchn_port_t port;
79404 +    /* OUT parameters */
79405 +#define EVTCHNSTAT_closed       0  /* Channel is not in use.                 */
79406 +#define EVTCHNSTAT_unbound      1  /* Channel awaits interdomain connection. */
79407 +#define EVTCHNSTAT_interdomain  2  /* Channel is connected to remote domain. */
79408 +#define EVTCHNSTAT_pirq         3  /* Channel is bound to a phys IRQ line.   */
79409 +#define EVTCHNSTAT_virq         4  /* Channel is bound to a virtual IRQ line */
79410 +#define EVTCHNSTAT_ipi          5  /* Channel is bound to a virtual IPI line */
79411 +    uint32_t status;
79412 +    uint32_t vcpu;                 /* VCPU to which this channel is bound.   */
79413 +    union {
79414 +        struct {
79415 +            domid_t dom;
79416 +        } unbound; /* EVTCHNSTAT_unbound */
79417 +        struct {
79418 +            domid_t dom;
79419 +            evtchn_port_t port;
79420 +        } interdomain; /* EVTCHNSTAT_interdomain */
79421 +        uint32_t pirq;      /* EVTCHNSTAT_pirq        */
79422 +        uint32_t virq;      /* EVTCHNSTAT_virq        */
79423 +    } u;
79424 +} evtchn_status_t;
79425 +
79426 +/*
79427 + * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
79428 + * event is pending.
79429 + * NOTES:
79430 + *  1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
79431 + *     the binding. This binding cannot be changed.
79432 + *  2. All other channels notify vcpu0 by default. This default is set when
79433 + *     the channel is allocated (a port that is freed and subsequently reused
79434 + *     has its binding reset to vcpu0).
79435 + */
79436 +#define EVTCHNOP_bind_vcpu        8
79437 +typedef struct evtchn_bind_vcpu {
79438 +    /* IN parameters. */
79439 +    evtchn_port_t port;
79440 +    uint32_t vcpu;
79441 +} evtchn_bind_vcpu_t;
79442 +
79443 +/*
79444 + * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
79445 + * a notification to the appropriate VCPU if an event is pending.
79446 + */
79447 +#define EVTCHNOP_unmask           9
79448 +typedef struct evtchn_unmask {
79449 +    /* IN parameters. */
79450 +    evtchn_port_t port;
79451 +} evtchn_unmask_t;
79452 +
79453 +typedef struct evtchn_op {
79454 +    uint32_t cmd; /* EVTCHNOP_* */
79455 +    union {
79456 +        evtchn_alloc_unbound_t    alloc_unbound;
79457 +        evtchn_bind_interdomain_t bind_interdomain;
79458 +        evtchn_bind_virq_t        bind_virq;
79459 +        evtchn_bind_pirq_t        bind_pirq;
79460 +        evtchn_bind_ipi_t         bind_ipi;
79461 +        evtchn_close_t            close;
79462 +        evtchn_send_t             send;
79463 +        evtchn_status_t           status;
79464 +        evtchn_bind_vcpu_t        bind_vcpu;
79465 +        evtchn_unmask_t           unmask;
79466 +    } u;
79467 +} evtchn_op_t;
79468 +DEFINE_GUEST_HANDLE(evtchn_op_t);
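/*
 * Illustrative sketch of EVTCHNOP_alloc_unbound via the op structure
 * above. Assumes the HYPERVISOR_event_channel_op hypercall wrapper
 * defined elsewhere in this tree; DOMID_SELF comes from xen.h.
 */
static int alloc_unbound_port(domid_t remote_dom, evtchn_port_t *port)
{
    evtchn_op_t op = {
        .cmd             = EVTCHNOP_alloc_unbound,
        .u.alloc_unbound = {
            .dom        = DOMID_SELF,   /* allocate in the calling domain */
            .remote_dom = remote_dom,   /* who may bind to the new port   */
        },
    };
    int rc = HYPERVISOR_event_channel_op(&op);
    if (rc == 0)
        *port = op.u.alloc_unbound.port;
    return rc;
}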
79469 +
79470 +#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
79471 +
79472 +/*
79473 + * Local variables:
79474 + * mode: C
79475 + * c-set-style: "BSD"
79476 + * c-basic-offset: 4
79477 + * tab-width: 4
79478 + * indent-tabs-mode: nil
79479 + * End:
79480 + */
79481 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/features.h tmp-linux-2.6-xen.patch/include/xen/interface/features.h
79482 --- ref-linux-2.6.16.9/include/xen/interface/features.h 1970-01-01 01:00:00.000000000 +0100
79483 +++ tmp-linux-2.6-xen.patch/include/xen/interface/features.h    2006-04-10 00:05:57.000000000 +0200
79484 @@ -0,0 +1,53 @@
79485 +/******************************************************************************
79486 + * features.h
79487 + * 
79488 + * Feature flags, reported by XENVER_get_features.
79489 + * 
79490 + * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
79491 + */
79492 +
79493 +#ifndef __XEN_PUBLIC_FEATURES_H__
79494 +#define __XEN_PUBLIC_FEATURES_H__
79495 +
79496 +/*
79497 + * If set, the guest does not need to write-protect its pagetables, and can
79498 + * update them via direct writes.
79499 + */
79500 +#define XENFEAT_writable_page_tables       0
79501 +
79502 +/*
79503 + * If set, the guest does not need to write-protect its segment descriptor
79504 + * tables, and can update them via direct writes.
79505 + */
79506 +#define XENFEAT_writable_descriptor_tables 1
79507 +
79508 +/*
79509 + * If set, translation between the guest's 'pseudo-physical' address space
79510 + * and the host's machine address space is handled by the hypervisor. In this
79511 + * mode the guest does not need to perform phys-to/from-machine translations
79512 + * when performing page table operations.
79513 + */
79514 +#define XENFEAT_auto_translated_physmap    2
79515 +
79516 +/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
79517 +#define XENFEAT_supervisor_mode_kernel     3
79518 +
79519 +/*
79520 + * If set, the guest does not need to allocate x86 PAE page directories
79521 + * below 4GB. This flag is usually implied by auto_translated_physmap.
79522 + */
79523 +#define XENFEAT_pae_pgdir_above_4gb        4
79524 +
79525 +#define XENFEAT_NR_SUBMAPS 1
79526 +
79527 +#endif /* __XEN_PUBLIC_FEATURES_H__ */
79528 +
79529 +/*
79530 + * Local variables:
79531 + * mode: C
79532 + * c-set-style: "BSD"
79533 + * c-basic-offset: 4
79534 + * tab-width: 4
79535 + * indent-tabs-mode: nil
79536 + * End:
79537 + */
79538 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/grant_table.h tmp-linux-2.6-xen.patch/include/xen/interface/grant_table.h
79539 --- ref-linux-2.6.16.9/include/xen/interface/grant_table.h      1970-01-01 01:00:00.000000000 +0100
79540 +++ tmp-linux-2.6-xen.patch/include/xen/interface/grant_table.h 2006-04-10 00:05:57.000000000 +0200
79541 @@ -0,0 +1,311 @@
79542 +/******************************************************************************
79543 + * grant_table.h
79544 + * 
79545 + * Interface for granting foreign access to page frames, and receiving
79546 + * page-ownership transfers.
79547 + * 
79548 + * Copyright (c) 2004, K A Fraser
79549 + */
79550 +
79551 +#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
79552 +#define __XEN_PUBLIC_GRANT_TABLE_H__
79553 +
79554 +
79555 +/***********************************
79556 + * GRANT TABLE REPRESENTATION
79557 + */
79558 +
79559 +/* Some rough guidelines on accessing and updating grant-table entries
79560 + * in a concurrency-safe manner. For more information, Linux contains a
79561 + * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
79562 + * 
79563 + * NB. WMB is a no-op on current-generation x86 processors. However, a
79564 + *     compiler barrier will still be required.
79565 + * 
79566 + * Introducing a valid entry into the grant table:
79567 + *  1. Write ent->domid.
79568 + *  2. Write ent->frame:
79569 + *      GTF_permit_access:   Frame to which access is permitted.
79570 + *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
79571 + *                           frame, or zero if none.
79572 + *  3. Write memory barrier (WMB).
79573 + *  4. Write ent->flags, inc. valid type.
79574 + * 
79575 + * Invalidating an unused GTF_permit_access entry:
79576 + *  1. flags = ent->flags.
79577 + *  2. Observe that !(flags & (GTF_reading|GTF_writing)).
79578 + *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
79579 + *  NB. No need for WMB as reuse of entry is control-dependent on success of
79580 + *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
79581 + *
79582 + * Invalidating an in-use GTF_permit_access entry:
79583 + *  This cannot be done directly. Request assistance from the domain controller
79584 + *  which can set a timeout on the use of a grant entry and take necessary
79585 + *  action. (NB. This is not yet implemented!).
79586 + * 
79587 + * Invalidating an unused GTF_accept_transfer entry:
79588 + *  1. flags = ent->flags.
79589 + *  2. Observe that !(flags & GTF_transfer_committed). [*]
79590 + *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
79591 + *  NB. No need for WMB as reuse of entry is control-dependent on success of
79592 + *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
79593 + *  [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
79594 + *      The guest must /not/ modify the grant entry until the address of the
79595 + *      transferred frame is written. It is safe for the guest to spin waiting
79596 + *      for this to occur (detect by observing GTF_transfer_completed in
79597 + *      ent->flags).
79598 + *
79599 + * Invalidating a committed GTF_accept_transfer entry:
79600 + *  1. Wait for (ent->flags & GTF_transfer_completed).
79601 + *
79602 + * Changing a GTF_permit_access from writable to read-only:
79603 + *  Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
79604 + * 
79605 + * Changing a GTF_permit_access from read-only to writable:
79606 + *  Use SMP-safe bit-setting instruction.
79607 + */
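/*
 * Illustrative sketch of the four-step publication sequence above, using
 * the grant_entry_t layout defined just below. wmb() is assumed to be
 * the kernel's write memory barrier.
 */
static void publish_grant(struct grant_entry *ent, domid_t domid,
                          uint32_t frame, int readonly)
{
    ent->domid = domid;                      /* step 1 */
    ent->frame = frame;                      /* step 2 */
    wmb();                                   /* step 3: data before flags */
    ent->flags = GTF_permit_access |         /* step 4: entry goes live */
                 (readonly ? GTF_readonly : 0);
}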
79608 +
79609 +/*
79610 + * A grant table comprises a packed array of grant entries in one or more
79611 + * page frames shared between Xen and a guest.
79612 + * [XEN]: This field is written by Xen and read by the sharing guest.
79613 + * [GST]: This field is written by the guest and read by Xen.
79614 + */
79615 +typedef struct grant_entry {
79616 +    /* GTF_xxx: various type and flag information.  [XEN,GST] */
79617 +    uint16_t flags;
79618 +    /* The domain being granted foreign privileges. [GST] */
79619 +    domid_t  domid;
79620 +    /*
79621 +     * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
79622 +     * GTF_accept_transfer: Frame whose ownership is transferred by @domid. [XEN]
79623 +     */
79624 +    uint32_t frame;
79625 +} grant_entry_t;
79626 +
79627 +/*
79628 + * Type of grant entry.
79629 + *  GTF_invalid: This grant entry grants no privileges.
79630 + *  GTF_permit_access: Allow @domid to map/access @frame.
79631 + *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
79632 + *                       to this guest. Xen writes the page number to @frame.
79633 + */
79634 +#define GTF_invalid         (0U<<0)
79635 +#define GTF_permit_access   (1U<<0)
79636 +#define GTF_accept_transfer (2U<<0)
79637 +#define GTF_type_mask       (3U<<0)
79638 +
79639 +/*
79640 + * Subflags for GTF_permit_access.
79641 + *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
79642 + *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
79643 + *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
79644 + */
79645 +#define _GTF_readonly       (2)
79646 +#define GTF_readonly        (1U<<_GTF_readonly)
79647 +#define _GTF_reading        (3)
79648 +#define GTF_reading         (1U<<_GTF_reading)
79649 +#define _GTF_writing        (4)
79650 +#define GTF_writing         (1U<<_GTF_writing)
79651 +
79652 +/*
79653 + * Subflags for GTF_accept_transfer:
79654 + *  GTF_transfer_committed: Xen sets this flag to indicate that it is committed
79655 + *      to transferring ownership of a page frame. When a guest sees this flag
79656 + *      it must /not/ modify the grant entry until GTF_transfer_completed is
79657 + *      set by Xen.
79658 + *  GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
79659 + *      after reading GTF_transfer_committed. Xen will always write the frame
79660 + *      address, followed by ORing this flag, in a timely manner.
79661 + */
79662 +#define _GTF_transfer_committed (2)
79663 +#define GTF_transfer_committed  (1U<<_GTF_transfer_committed)
79664 +#define _GTF_transfer_completed (3)
79665 +#define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
79666 +
79667 +
79668 +/***********************************
79669 + * GRANT TABLE QUERIES AND USES
79670 + */
79671 +
79672 +/*
79673 + * Reference to a grant entry in a specified domain's grant table.
79674 + */
79675 +typedef uint32_t grant_ref_t;
79676 +
79677 +/*
79678 + * Handle to track a mapping created via a grant reference.
79679 + */
79680 +typedef uint32_t grant_handle_t;
79681 +
79682 +/*
79683 + * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
79684 + * by devices and/or host CPUs. If successful, <handle> is a tracking number
79685 + * that must be presented later to destroy the mapping(s). On error, <handle>
79686 + * is a negative status code.
79687 + * NOTES:
79688 + *  1. If GNTPIN_map_for_dev is specified then <dev_bus_addr> is the address
79689 + *     via which I/O devices may access the granted frame.
79690 + *  2. If GNTPIN_map_for_host is specified then a mapping will be added at
79691 + *     either a host virtual address in the current address space, or at
79692 + *     a PTE at the specified machine address.  The type of mapping to
79693 + *     perform is selected through the GNTMAP_contains_pte flag, and the 
79694 + *     address is specified in <host_addr>.
79695 + *  3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
79696 + *     host mapping is destroyed by other means then it is *NOT* guaranteed
79697 + *     to be accounted to the correct grant reference!
79698 + */
79699 +#define GNTTABOP_map_grant_ref        0
79700 +typedef struct gnttab_map_grant_ref {
79701 +    /* IN parameters. */
79702 +    uint64_t host_addr;
79703 +    uint32_t flags;               /* GNTMAP_* */
79704 +    grant_ref_t ref;
79705 +    domid_t  dom;
79706 +    /* OUT parameters. */
79707 +    int16_t  status;              /* GNTST_* */
79708 +    grant_handle_t handle;
79709 +    uint64_t dev_bus_addr;
79710 +} gnttab_map_grant_ref_t;
79711 +DEFINE_GUEST_HANDLE(gnttab_map_grant_ref_t);
79712 +
79713 +/*
79714 + * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
79715 + * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
79716 + * field is ignored. If non-zero, it must refer to a device/host mapping
79717 + * that is tracked by <handle>.
79718 + * NOTES:
79719 + *  1. The call may fail in an undefined manner if either mapping is not
79720 + *     tracked by <handle>.
79721 + *  2. After executing a batch of unmaps, it is guaranteed that no stale
79722 + *     mappings will remain in the device or host TLBs.
79723 + */
79724 +#define GNTTABOP_unmap_grant_ref      1
79725 +typedef struct gnttab_unmap_grant_ref {
79726 +    /* IN parameters. */
79727 +    uint64_t host_addr;
79728 +    uint64_t dev_bus_addr;
79729 +    grant_handle_t handle;
79730 +    /* OUT parameters. */
79731 +    int16_t  status;              /* GNTST_* */
79732 +} gnttab_unmap_grant_ref_t;
79733 +DEFINE_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
79734 +
79735 +/*
79736 + * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
79737 + * <nr_frames> pages. The frame addresses are written to the <frame_list>.
79738 + * Only <nr_frames> addresses are written, even if the table is larger.
79739 + * NOTES:
79740 + *  1. <dom> may be specified as DOMID_SELF.
79741 + *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
79742 + *  3. Xen may not support more than a single grant-table page per domain.
79743 + */
79744 +#define GNTTABOP_setup_table          2
79745 +typedef struct gnttab_setup_table {
79746 +    /* IN parameters. */
79747 +    domid_t  dom;
79748 +    uint32_t nr_frames;
79749 +    /* OUT parameters. */
79750 +    int16_t  status;              /* GNTST_* */
79751 +    GUEST_HANDLE(ulong) frame_list;
79752 +} gnttab_setup_table_t;
79753 +DEFINE_GUEST_HANDLE(gnttab_setup_table_t);
79754 +
79755 +/*
79756 + * GNTTABOP_dump_table: Dump the contents of the grant table to the
79757 + * xen console. Debugging use only.
79758 + */
79759 +#define GNTTABOP_dump_table           3
79760 +typedef struct gnttab_dump_table {
79761 +    /* IN parameters. */
79762 +    domid_t dom;
79763 +    /* OUT parameters. */
79764 +    int16_t status;               /* GNTST_* */
79765 +} gnttab_dump_table_t;
79766 +DEFINE_GUEST_HANDLE(gnttab_dump_table_t);
79767 +
79768 +/*
79769 + * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
79770 + * foreign domain has previously registered its interest in the transfer via
79771 + * <domid, ref>.
79772 + * 
79773 + * Note that, even if the transfer fails, the specified page no longer belongs
79774 + * to the calling domain *unless* the error is GNTST_bad_page.
79775 + */
79776 +#define GNTTABOP_transfer                4
79777 +typedef struct gnttab_transfer {
79778 +    /* IN parameters. */
79779 +    unsigned long mfn;
79780 +    domid_t       domid;
79781 +    grant_ref_t   ref;
79782 +    /* OUT parameters. */
79783 +    int16_t       status;
79784 +} gnttab_transfer_t;
79785 +DEFINE_GUEST_HANDLE(gnttab_transfer_t);
79786 +
79787 +/*
79788 + * Bitfield values for update_pin_status.flags.
79789 + */
79790 + /* Map the grant entry for access by I/O devices. */
79791 +#define _GNTMAP_device_map      (0)
79792 +#define GNTMAP_device_map       (1<<_GNTMAP_device_map)
79793 + /* Map the grant entry for access by host CPUs. */
79794 +#define _GNTMAP_host_map        (1)
79795 +#define GNTMAP_host_map         (1<<_GNTMAP_host_map)
79796 + /* Accesses to the granted frame will be restricted to read-only access. */
79797 +#define _GNTMAP_readonly        (2)
79798 +#define GNTMAP_readonly         (1<<_GNTMAP_readonly)
79799 + /*
79800 +  * GNTMAP_host_map subflag:
79801 +  *  0 => The host mapping is usable only by the guest OS.
79802 +  *  1 => The host mapping is usable by guest OS + current application.
79803 +  */
79804 +#define _GNTMAP_application_map (3)
79805 +#define GNTMAP_application_map  (1<<_GNTMAP_application_map)
79806 +
79807 + /*
79808 +  * GNTMAP_contains_pte subflag:
79809 +  *  0 => This map request contains a host virtual address.
79810 +  *  1 => This map request contains the machine address of the PTE to update.
79811 +  */
79812 +#define _GNTMAP_contains_pte    (4)
79813 +#define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
79814 +
79815 +/*
79816 + * Values for error status returns. All errors are -ve.
79817 + */
79818 +#define GNTST_okay             (0)  /* Normal return.                        */
79819 +#define GNTST_general_error    (-1) /* General undefined error.              */
79820 +#define GNTST_bad_domain       (-2) /* Unrecognised domain id.               */
79821 +#define GNTST_bad_gntref       (-3) /* Unrecognised or inappropriate gntref. */
79822 +#define GNTST_bad_handle       (-4) /* Unrecognised or inappropriate handle. */
79823 +#define GNTST_bad_virt_addr    (-5) /* Inappropriate virtual address to map. */
79824 +#define GNTST_bad_dev_addr     (-6) /* Inappropriate device address to unmap.*/
79825 +#define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
79826 +#define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
79827 +#define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
79828 +
79829 +#define GNTTABOP_error_msgs {                   \
79830 +    "okay",                                     \
79831 +    "undefined error",                          \
79832 +    "unrecognised domain id",                   \
79833 +    "invalid grant reference",                  \
79834 +    "invalid mapping handle",                   \
79835 +    "invalid virtual address",                  \
79836 +    "invalid device address",                   \
79837 +    "no spare translation slot in the I/O MMU", \
79838 +    "permission denied",                        \
79839 +    "bad page"                                  \
79840 +}
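/*
 * Illustrative sketch: mapping a GNTST_* status onto the message table
 * above. Status codes are zero or negative, so the negated value indexes
 * the array directly.
 */
static const char *gntst_to_string(int16_t status)
{
    static const char *msgs[] = GNTTABOP_error_msgs;
    int idx = -status;
    if (idx < 0 || idx >= (int)(sizeof(msgs) / sizeof(msgs[0])))
        return "bad status";
    return msgs[idx];
}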
79841 +
79842 +#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
79843 +
79844 +/*
79845 + * Local variables:
79846 + * mode: C
79847 + * c-set-style: "BSD"
79848 + * c-basic-offset: 4
79849 + * tab-width: 4
79850 + * indent-tabs-mode: nil
79851 + * End:
79852 + */
79853 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/hvm/hvm_info_table.h tmp-linux-2.6-xen.patch/include/xen/interface/hvm/hvm_info_table.h
79854 --- ref-linux-2.6.16.9/include/xen/interface/hvm/hvm_info_table.h       1970-01-01 01:00:00.000000000 +0100
79855 +++ tmp-linux-2.6-xen.patch/include/xen/interface/hvm/hvm_info_table.h  2006-04-10 00:05:57.000000000 +0200
79856 @@ -0,0 +1,24 @@
79857 +/******************************************************************************
79858 + * hvm/hvm_info_table.h
79859 + * 
79860 + * HVM parameter and information table, written into guest memory map.
79861 + */
79862 +
79863 +#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
79864 +#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
79865 +
79866 +#define HVM_INFO_PFN         0x09F
79867 +#define HVM_INFO_OFFSET      0x800
79868 +#define HVM_INFO_PADDR       ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
79869 +
79870 +struct hvm_info_table {
79871 +    char        signature[8]; /* "HVM INFO" */
79872 +    uint32_t    length;
79873 +    uint8_t     checksum;
79874 +    uint8_t     acpi_enabled;
79875 +    uint8_t     apic_enabled;
79876 +    uint8_t     pae_enabled;
79877 +    uint32_t    nr_vcpus;
79878 +};
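/*
 * Illustrative sketch of validating the table. The checksum byte is
 * assumed (by analogy with ACPI tables) to make the byte sum of the
 * whole table zero mod 256; verify against the tool that writes it.
 * memcmp() comes from linux/string.h.
 */
static int hvm_info_table_valid(const struct hvm_info_table *t)
{
    const uint8_t *p = (const uint8_t *)t;
    uint8_t sum = 0;
    uint32_t i;

    if (memcmp(t->signature, "HVM INFO", 8) != 0)
        return 0;
    for (i = 0; i < t->length; i++)
        sum += p[i];
    return sum == 0;                /* checksum balances the table */
}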
79879 +
79880 +#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
79881 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/hvm/ioreq.h tmp-linux-2.6-xen.patch/include/xen/interface/hvm/ioreq.h
79882 --- ref-linux-2.6.16.9/include/xen/interface/hvm/ioreq.h        1970-01-01 01:00:00.000000000 +0100
79883 +++ tmp-linux-2.6-xen.patch/include/xen/interface/hvm/ioreq.h   2006-04-10 00:05:57.000000000 +0200
79884 @@ -0,0 +1,94 @@
79885 +/*
79886 + * ioreq.h: I/O request definitions for device models
79887 + * Copyright (c) 2004, Intel Corporation.
79888 + *
79889 + * This program is free software; you can redistribute it and/or modify it
79890 + * under the terms and conditions of the GNU General Public License,
79891 + * version 2, as published by the Free Software Foundation.
79892 + *
79893 + * This program is distributed in the hope it will be useful, but WITHOUT
79894 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
79895 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
79896 + * more details.
79897 + *
79898 + * You should have received a copy of the GNU General Public License along with
79899 + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
79900 + * Place - Suite 330, Boston, MA 02111-1307 USA.
79901 + *
79902 + */
79903 +
79904 +#ifndef _IOREQ_H_
79905 +#define _IOREQ_H_
79906 +
79907 +#define IOREQ_READ      1
79908 +#define IOREQ_WRITE     0
79909 +
79910 +#define STATE_INVALID           0
79911 +#define STATE_IOREQ_READY       1
79912 +#define STATE_IOREQ_INPROCESS   2
79913 +#define STATE_IORESP_READY      3
79914 +#define STATE_IORESP_HOOK       4
79915 +
79916 +#define IOREQ_TYPE_PIO          0 /* pio */
79917 +#define IOREQ_TYPE_COPY         1 /* mmio ops */
79918 +#define IOREQ_TYPE_AND          2
79919 +#define IOREQ_TYPE_OR           3
79920 +#define IOREQ_TYPE_XOR          4
79921 +#define IOREQ_TYPE_XCHG         5
79922 +
79923 +/*
79924 + * The VMExit dispatcher should cooperate with the instruction decoder to
79925 + * prepare this structure and notify the service OS and DM by sending a
79926 + * virq.
79927 + */
79928 +typedef struct {
79929 +    uint64_t addr;          /*  physical address            */
79930 +    uint64_t size;          /*  size in bytes               */
79931 +    uint64_t count;         /*  for rep prefixes            */
79932 +    union {
79933 +        uint64_t data;      /*  data                        */
79934 +        void    *pdata;     /*  pointer to data             */
79935 +    } u;
79936 +    uint8_t state:4;
79937 +    uint8_t pdata_valid:1;  /* if 1, use pdata above        */
79938 +    uint8_t dir:1;          /*  1=read, 0=write             */
79939 +    uint8_t df:1;
79940 +    uint8_t type;           /* I/O type                     */
79941 +    uint64_t io_count;      /* number of I/Os done on this vcpu */
79942 +} ioreq_t;
79943 +
79944 +#define MAX_VECTOR      256
79945 +#define BITS_PER_BYTE   8
79946 +#define INTR_LEN        (MAX_VECTOR/(BITS_PER_BYTE * sizeof(uint64_t)))
79947 +#define INTR_LEN_32     (MAX_VECTOR/(BITS_PER_BYTE * sizeof(uint32_t)))
79948 +
79949 +typedef struct {
79950 +    uint16_t    pic_elcr;
79951 +    uint16_t    pic_irr;
79952 +    uint16_t    pic_last_irr;
79953 +    uint16_t    pic_clear_irr;
79954 +} global_iodata_t;
79955 +
79956 +typedef struct {
79957 +    ioreq_t         vp_ioreq;
79958 +    /* Event channel port */
79959 +    unsigned int    vp_eport;   /* VMX vcpu uses this to notify DM */
79960 +    unsigned int    dm_eport;   /* DM uses this to notify VMX vcpu */
79961 +} vcpu_iodata_t;
79962 +
79963 +typedef struct {
79964 +    global_iodata_t sp_global;
79965 +    vcpu_iodata_t   vcpu_iodata[1];
79966 +} shared_iopage_t;
79967 +
79968 +#endif /* _IOREQ_H_ */
79969 +
79970 +/*
79971 + * Local variables:
79972 + * mode: C
79973 + * c-set-style: "BSD"
79974 + * c-basic-offset: 4
79975 + * tab-width: 4
79976 + * indent-tabs-mode: nil
79977 + * End:
79978 + */
79979 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/hvm/vmx_assist.h tmp-linux-2.6-xen.patch/include/xen/interface/hvm/vmx_assist.h
79980 --- ref-linux-2.6.16.9/include/xen/interface/hvm/vmx_assist.h   1970-01-01 01:00:00.000000000 +0100
79981 +++ tmp-linux-2.6-xen.patch/include/xen/interface/hvm/vmx_assist.h      2006-04-10 00:05:57.000000000 +0200
79982 @@ -0,0 +1,97 @@
79983 +/*
79984 + * vmx_assist.h: Context definitions for the VMXASSIST world switch.
79985 + *
79986 + * Leendert van Doorn, leendert@watson.ibm.com
79987 + * Copyright (c) 2005, International Business Machines Corporation.
79988 + */
79989 +
79990 +#ifndef _VMX_ASSIST_H_
79991 +#define _VMX_ASSIST_H_
79992 +
79993 +#define VMXASSIST_BASE         0xD0000
79994 +#define VMXASSIST_MAGIC        0x17101966
79995 +#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8)
79996 +
79997 +#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12)
79998 +#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4)
79999 +
80000 +#ifndef __ASSEMBLY__
80001 +
80002 +union vmcs_arbytes {
80003 +    struct arbyte_fields {
80004 +        unsigned int seg_type : 4,
80005 +            s         : 1,
80006 +            dpl       : 2,
80007 +            p         : 1,
80008 +            reserved0 : 4,
80009 +            avl       : 1,
80010 +            reserved1 : 1,
80011 +            default_ops_size: 1,
80012 +            g         : 1,
80013 +            null_bit  : 1,
80014 +            reserved2 : 15;
80015 +    } fields;
80016 +    unsigned int bytes;
80017 +};
80018 +
80019 +/*
80020 + * World switch state
80021 + */
80022 +typedef struct vmx_assist_context {
80023 +    uint32_t  eip;        /* execution pointer */
80024 +    uint32_t  esp;        /* stack pointer */
80025 +    uint32_t  eflags;     /* flags register */
80026 +    uint32_t  cr0;
80027 +    uint32_t  cr3;        /* page table directory */
80028 +    uint32_t  cr4;
80029 +    uint32_t  idtr_limit; /* idt */
80030 +    uint32_t  idtr_base;
80031 +    uint32_t  gdtr_limit; /* gdt */
80032 +    uint32_t  gdtr_base;
80033 +    uint32_t  cs_sel;     /* cs selector */
80034 +    uint32_t  cs_limit;
80035 +    uint32_t  cs_base;
80036 +    union vmcs_arbytes cs_arbytes;
80037 +    uint32_t  ds_sel;     /* ds selector */
80038 +    uint32_t  ds_limit;
80039 +    uint32_t  ds_base;
80040 +    union vmcs_arbytes ds_arbytes;
80041 +    uint32_t  es_sel;     /* es selector */
80042 +    uint32_t  es_limit;
80043 +    uint32_t  es_base;
80044 +    union vmcs_arbytes es_arbytes;
80045 +    uint32_t  ss_sel;     /* ss selector */
80046 +    uint32_t  ss_limit;
80047 +    uint32_t  ss_base;
80048 +    union vmcs_arbytes ss_arbytes;
80049 +    uint32_t  fs_sel;     /* fs selector */
80050 +    uint32_t  fs_limit;
80051 +    uint32_t  fs_base;
80052 +    union vmcs_arbytes fs_arbytes;
80053 +    uint32_t  gs_sel;     /* gs selector */
80054 +    uint32_t  gs_limit;
80055 +    uint32_t  gs_base;
80056 +    union vmcs_arbytes gs_arbytes;
80057 +    uint32_t  tr_sel;     /* task selector */
80058 +    uint32_t  tr_limit;
80059 +    uint32_t  tr_base;
80060 +    union vmcs_arbytes tr_arbytes;
80061 +    uint32_t  ldtr_sel;   /* ldtr selector */
80062 +    uint32_t  ldtr_limit;
80063 +    uint32_t  ldtr_base;
80064 +    union vmcs_arbytes ldtr_arbytes;
80065 +} vmx_assist_context_t;
80066 +
80067 +#endif /* __ASSEMBLY__ */
80068 +
80069 +#endif /* _VMX_ASSIST_H_ */
80070 +
80071 +/*
80072 + * Local variables:
80073 + * mode: C
80074 + * c-set-style: "BSD"
80075 + * c-basic-offset: 4
80076 + * tab-width: 4
80077 + * indent-tabs-mode: nil
80078 + * End:
80079 + */
80080 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/io/blkif.h tmp-linux-2.6-xen.patch/include/xen/interface/io/blkif.h
80081 --- ref-linux-2.6.16.9/include/xen/interface/io/blkif.h 1970-01-01 01:00:00.000000000 +0100
80082 +++ tmp-linux-2.6-xen.patch/include/xen/interface/io/blkif.h    2006-04-10 00:05:57.000000000 +0200
80083 @@ -0,0 +1,85 @@
80084 +/******************************************************************************
80085 + * blkif.h
80086 + * 
80087 + * Unified block-device I/O interface for Xen guest OSes.
80088 + * 
80089 + * Copyright (c) 2003-2004, Keir Fraser
80090 + */
80091 +
80092 +#ifndef __XEN_PUBLIC_IO_BLKIF_H__
80093 +#define __XEN_PUBLIC_IO_BLKIF_H__
80094 +
80095 +#include "ring.h"
80096 +#include "../grant_table.h"
80097 +
80098 +/*
80099 + * Front->back notifications: When enqueuing a new request, sending a
80100 + * notification can be made conditional on req_event (i.e., the generic
80101 + * hold-off mechanism provided by the ring macros). Backends must set
80102 + * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
80103 + * 
80104 + * Back->front notifications: When enqueuing a new response, sending a
80105 + * notification can be made conditional on rsp_event (i.e., the generic
80106 + * hold-off mechanism provided by the ring macros). Frontends must set
80107 + * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
80108 + */
80109 +
80110 +#ifndef blkif_vdev_t
80111 +#define blkif_vdev_t   uint16_t
80112 +#endif
80113 +#define blkif_sector_t uint64_t
80114 +
80115 +#define BLKIF_OP_READ      0
80116 +#define BLKIF_OP_WRITE     1
80117 +
80118 +/*
80119 + * Maximum scatter/gather segments per request.
80120 + * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
80121 + * NB. This could be 12 if the ring indexes weren't stored in the same page.
80122 + */
80123 +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
80124 +
80125 +typedef struct blkif_request {
80126 +    uint8_t        operation;    /* BLKIF_OP_???                         */
80127 +    uint8_t        nr_segments;  /* number of segments                   */
80128 +    blkif_vdev_t   handle;       /* only for read/write requests         */
80129 +    uint64_t       id;           /* private guest value, echoed in resp  */
80130 +    blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
80131 +    struct blkif_request_segment {
80132 +        grant_ref_t gref;        /* reference to I/O buffer frame        */
80133 +        /* @first_sect: first sector in frame to transfer (inclusive).   */
80134 +        /* @last_sect: last sector in frame to transfer (inclusive).     */
80135 +        uint8_t     first_sect, last_sect;
80136 +    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
80137 +} blkif_request_t;
80138 +
80139 +typedef struct blkif_response {
80140 +    uint64_t        id;              /* copied from request */
80141 +    uint8_t         operation;       /* copied from request */
80142 +    int16_t         status;          /* BLKIF_RSP_???       */
80143 +} blkif_response_t;
80144 +
80145 +#define BLKIF_RSP_ERROR  -1 /* non-specific 'error' */
80146 +#define BLKIF_RSP_OKAY    0 /* non-specific 'okay'  */
80147 +
80148 +/*
80149 + * Generate blkif ring structures and types.
80150 + */
80151 +
80152 +DEFINE_RING_TYPES(blkif, blkif_request_t, blkif_response_t);
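/*
 * Illustrative sketch: filling in a one-segment read request. Obtaining
 * the request slot and pushing it (RING_GET_REQUEST, RING_PUSH_REQUESTS
 * from ring.h) is assumed to happen around this call.
 */
static void fill_read_request(blkif_request_t *req, blkif_vdev_t handle,
                              uint64_t id, blkif_sector_t sector,
                              grant_ref_t gref)
{
    req->operation     = BLKIF_OP_READ;
    req->nr_segments   = 1;
    req->handle        = handle;
    req->id            = id;          /* echoed back in the response */
    req->sector_number = sector;
    req->seg[0].gref       = gref;    /* granted I/O buffer frame */
    req->seg[0].first_sect = 0;
    req->seg[0].last_sect  = 7;       /* 8 x 512-byte sectors = one 4K frame */
}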
80153 +
80154 +#define VDISK_CDROM        0x1
80155 +#define VDISK_REMOVABLE    0x2
80156 +#define VDISK_READONLY     0x4
80157 +
80158 +#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
80159 +
80160 +/*
80161 + * Local variables:
80162 + * mode: C
80163 + * c-set-style: "BSD"
80164 + * c-basic-offset: 4
80165 + * tab-width: 4
80166 + * indent-tabs-mode: nil
80167 + * End:
80168 + */
80169 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/io/console.h tmp-linux-2.6-xen.patch/include/xen/interface/io/console.h
80170 --- ref-linux-2.6.16.9/include/xen/interface/io/console.h       1970-01-01 01:00:00.000000000 +0100
80171 +++ tmp-linux-2.6-xen.patch/include/xen/interface/io/console.h  2006-04-10 00:05:57.000000000 +0200
80172 @@ -0,0 +1,33 @@
80173 +/******************************************************************************
80174 + * console.h
80175 + * 
80176 + * Console I/O interface for Xen guest OSes.
80177 + * 
80178 + * Copyright (c) 2005, Keir Fraser
80179 + */
80180 +
80181 +#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
80182 +#define __XEN_PUBLIC_IO_CONSOLE_H__
80183 +
80184 +typedef uint32_t XENCONS_RING_IDX;
80185 +
80186 +#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
80187 +
80188 +struct xencons_interface {
80189 +    char in[1024];
80190 +    char out[2048];
80191 +    XENCONS_RING_IDX in_cons, in_prod;
80192 +    XENCONS_RING_IDX out_cons, out_prod;
80193 +};
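/*
 * Illustrative sketch of producing bytes into the 'out' ring. The
 * indexes are free-running; MASK_XENCONS_IDX wraps them to the buffer
 * size, which is why both buffers are powers of two. wmb() is assumed
 * to be the kernel's write memory barrier.
 */
static int xencons_put(struct xencons_interface *intf,
                       const char *data, int len)
{
    int sent = 0;
    XENCONS_RING_IDX prod = intf->out_prod;

    while (sent < len && (prod - intf->out_cons) < sizeof(intf->out))
        intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
    wmb();                  /* publish data before the producer index */
    intf->out_prod = prod;
    return sent;            /* bytes actually queued */
}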
80194 +
80195 +#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
80196 +
80197 +/*
80198 + * Local variables:
80199 + * mode: C
80200 + * c-set-style: "BSD"
80201 + * c-basic-offset: 4
80202 + * tab-width: 4
80203 + * indent-tabs-mode: nil
80204 + * End:
80205 + */
80206 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/io/netif.h tmp-linux-2.6-xen.patch/include/xen/interface/io/netif.h
80207 --- ref-linux-2.6.16.9/include/xen/interface/io/netif.h 1970-01-01 01:00:00.000000000 +0100
80208 +++ tmp-linux-2.6-xen.patch/include/xen/interface/io/netif.h    2006-04-10 00:05:57.000000000 +0200
80209 @@ -0,0 +1,84 @@
80210 +/******************************************************************************
80211 + * netif.h
80212 + * 
80213 + * Unified network-device I/O interface for Xen guest OSes.
80214 + * 
80215 + * Copyright (c) 2003-2004, Keir Fraser
80216 + */
80217 +
80218 +#ifndef __XEN_PUBLIC_IO_NETIF_H__
80219 +#define __XEN_PUBLIC_IO_NETIF_H__
80220 +
80221 +#include "ring.h"
80222 +#include "../grant_table.h"
80223 +
80224 +/*
80225 + * Note that there is *never* any need to notify the backend when enqueuing
80226 + * receive requests (netif_rx_request_t). Notifications after enqueuing any
80227 + * other type of message should be conditional on the appropriate req_event
80228 + * or rsp_event field in the shared ring.
80229 + */
80230 +
80231 +/* Protocol checksum field is blank in the packet (hardware offload)? */
80232 +#define _NETTXF_csum_blank     (0)
80233 +#define  NETTXF_csum_blank     (1U<<_NETTXF_csum_blank)
80234 +
80235 +/* Packet data has been validated against protocol checksum. */
80236 +#define _NETTXF_data_validated (1)
80237 +#define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
80238 +
80239 +typedef struct netif_tx_request {
80240 +    grant_ref_t gref;      /* Reference to buffer page */
80241 +    uint16_t offset;       /* Offset within buffer page */
80242 +    uint16_t flags;        /* NETTXF_* */
80243 +    uint16_t id;           /* Echoed in response message. */
80244 +    uint16_t size;         /* Packet size in bytes.       */
80245 +} netif_tx_request_t;
80246 +
80247 +typedef struct netif_tx_response {
80248 +    uint16_t id;
80249 +    int16_t  status;       /* NETIF_RSP_* */
80250 +} netif_tx_response_t;
80251 +
80252 +typedef struct {
80253 +    uint16_t    id;        /* Echoed in response message.        */
80254 +    grant_ref_t gref;      /* Reference to incoming granted frame */
80255 +} netif_rx_request_t;
80256 +
80257 +/* Packet data has been validated against protocol checksum. */
80258 +#define _NETRXF_data_validated (0)
80259 +#define  NETRXF_data_validated (1U<<_NETRXF_data_validated)
80260 +
80261 +/* Protocol checksum field is blank in the packet (hardware offload)? */
80262 +#define _NETRXF_csum_blank     (1)
80263 +#define  NETRXF_csum_blank     (1U<<_NETRXF_csum_blank)
80264 +
80265 +typedef struct {
80266 +    uint16_t id;
80267 +    uint16_t offset;       /* Offset in page of start of received packet  */
80268 +    uint16_t flags;        /* NETRXF_* */
80269 +    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
80270 +} netif_rx_response_t;
80271 +
80272 +/*
80273 + * Generate netif ring structures and types.
80274 + */
80275 +
80276 +DEFINE_RING_TYPES(netif_tx, netif_tx_request_t, netif_tx_response_t);
80277 +DEFINE_RING_TYPES(netif_rx, netif_rx_request_t, netif_rx_response_t);
80278 +
80279 +#define NETIF_RSP_DROPPED         -2
80280 +#define NETIF_RSP_ERROR           -1
80281 +#define NETIF_RSP_OKAY             0
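/*
 * Illustrative sketch of decoding an rx response status per the comment
 * on netif_rx_response_t above: negative values are NETIF_RSP_* errors,
 * positive values are the received packet length in bytes.
 */
static int rx_status_to_len(int16_t status)
{
    if (status < NETIF_RSP_OKAY)
        return -1;          /* NETIF_RSP_ERROR or NETIF_RSP_DROPPED */
    return status;          /* received length in bytes */
}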
80282 +
80283 +#endif
80284 +
80285 +/*
80286 + * Local variables:
80287 + * mode: C
80288 + * c-set-style: "BSD"
80289 + * c-basic-offset: 4
80290 + * tab-width: 4
80291 + * indent-tabs-mode: nil
80292 + * End:
80293 + */
80294 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/io/pciif.h tmp-linux-2.6-xen.patch/include/xen/interface/io/pciif.h
80295 --- ref-linux-2.6.16.9/include/xen/interface/io/pciif.h 1970-01-01 01:00:00.000000000 +0100
80296 +++ tmp-linux-2.6-xen.patch/include/xen/interface/io/pciif.h    2006-04-10 00:05:57.000000000 +0200
80297 @@ -0,0 +1,55 @@
80298 +/*
80299 + * PCI Backend/Frontend Common Data Structures & Macros
80300 + *
80301 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
80302 + */
80303 +#ifndef __XEN_PCI_COMMON_H__
80304 +#define __XEN_PCI_COMMON_H__
80305 +
80306 +/* Be sure to bump this number if you change this file */
80307 +#define XEN_PCI_MAGIC          "7"
80308 +
80309 +/* xen_pci_sharedinfo flags */
80310 +#define _XEN_PCIF_active     (0)
80311 +#define XEN_PCIF_active      (1<<_XEN_PCIF_active)
80312 +
80313 +/* xen_pci_op commands */
80314 +#define XEN_PCI_OP_conf_read    (0)
80315 +#define XEN_PCI_OP_conf_write   (1)
80316 +
80317 +/* xen_pci_op error numbers */
80318 +#define XEN_PCI_ERR_success          (0)
80319 +#define XEN_PCI_ERR_dev_not_found   (-1)
80320 +#define XEN_PCI_ERR_invalid_offset  (-2)
80321 +#define XEN_PCI_ERR_access_denied   (-3)
80322 +#define XEN_PCI_ERR_not_implemented (-4)
80323 +/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
80324 +#define XEN_PCI_ERR_op_failed       (-5)
80325 +
80326 +struct xen_pci_op {
80327 +       /* IN: what action to perform: XEN_PCI_OP_* */
80328 +       uint32_t cmd;
80329 +
80330 +       /* OUT: will contain an error number (if any) from errno.h */
80331 +       int32_t err;
80332 +
80333 +       /* IN: which device to touch */
80334 +       uint32_t domain; /* PCI Domain/Segment */
80335 +       uint32_t bus;
80336 +       uint32_t devfn;
80337 +
80338 +       /* IN: which configuration registers to touch */
80339 +       int32_t offset;
80340 +       int32_t size;
80341 +
80342 +       /* IN/OUT: Contains the result after a READ or the value to WRITE */
80343 +       uint32_t value;
80344 +};
80345 +
80346 +struct xen_pci_sharedinfo {
80347 +       /* flags - XEN_PCIF_* */
80348 +       uint32_t flags;
80349 +       struct xen_pci_op op;
80350 +};
80351 +
80352 +#endif /* __XEN_PCI_COMMON_H__ */
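A hedged sketch of the request protocol these structures imply: the frontend fills in `op`, raises the active flag, and kicks the backend. PCI_DEVFN and PCI_VENDOR_ID come from <linux/pci.h>; the notify/wait steps are elided, and the device address is made up.

static uint32_t example_conf_read(struct xen_pci_sharedinfo *info)
{
    struct xen_pci_op *op = &info->op;

    op->cmd    = XEN_PCI_OP_conf_read;
    op->domain = 0;                  /* PCI segment 0000            */
    op->bus    = 0;
    op->devfn  = PCI_DEVFN(3, 0);    /* hypothetical device 00:03.0 */
    op->offset = PCI_VENDOR_ID;      /* config register to read     */
    op->size   = 2;                  /* 16-bit access               */

    info->flags |= XEN_PCIF_active;  /* hand the op to the backend  */
    /* ... notify the backend, wait for it to clear XEN_PCIF_active ... */
    return (op->err == XEN_PCI_ERR_success) ? op->value : 0;
}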
80353 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/io/ring.h tmp-linux-2.6-xen.patch/include/xen/interface/io/ring.h
80354 --- ref-linux-2.6.16.9/include/xen/interface/io/ring.h  1970-01-01 01:00:00.000000000 +0100
80355 +++ tmp-linux-2.6-xen.patch/include/xen/interface/io/ring.h     2006-04-10 00:05:57.000000000 +0200
80356 @@ -0,0 +1,265 @@
80357 +/******************************************************************************
80358 + * ring.h
80359 + * 
80360 + * Shared producer-consumer ring macros.
80361 + *
80362 + * Tim Deegan and Andrew Warfield November 2004.
80363 + */
80364 +
80365 +#ifndef __XEN_PUBLIC_IO_RING_H__
80366 +#define __XEN_PUBLIC_IO_RING_H__
80367 +
80368 +typedef unsigned int RING_IDX;
80369 +
80370 +/* Round a 32-bit unsigned constant down to the nearest power of two. */
80371 +#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                  : ((_x) & 0x1))
80372 +#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    : __RD2(_x))
80373 +#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    : __RD4(_x))
80374 +#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
80375 +#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
80376 +
80377 +/*
80378 + * Calculate size of a shared ring, given the total available space for the
80379 + * ring and indexes (_sz), and the name tag of the request/response structure.
80380 + * A ring contains as many entries as will fit, rounded down to the nearest 
80381 + * power of two (so we can mask with (size-1) to loop around).
80382 + */
80383 +#define __RING_SIZE(_s, _sz) \
80384 +    (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
80385 +
80386 +/*
80387 + * Macros to make the correct C datatypes for a new kind of ring.
80388 + * 
80389 + * To make a new ring datatype, you need to have two message structures,
80390 + * let's say request_t, and response_t already defined.
80391 + *
80392 + * In a header where you want the ring datatype declared, you then do:
80393 + *
80394 + *     DEFINE_RING_TYPES(mytag, request_t, response_t);
80395 + *
80396 + * These expand out to give you a set of types, as you can see below.
80397 + * The most important of these are:
80398 + * 
80399 + *     mytag_sring_t      - The shared ring.
80400 + *     mytag_front_ring_t - The 'front' half of the ring.
80401 + *     mytag_back_ring_t  - The 'back' half of the ring.
80402 + *
80403 + * To initialise a ring in your code you need to know the location and size
80404 + * of the shared memory area (PAGE_SIZE, for instance). To initialise
80405 + * the front half:
80406 + *
80407 + *     mytag_front_ring_t front_ring;
80408 + *     SHARED_RING_INIT((mytag_sring_t *)shared_page);
80409 + *     FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
80410 + *
80411 + * Initialising the back follows similarly (note that only the front
80412 + * initializes the shared ring):
80413 + *
80414 + *     mytag_back_ring_t back_ring;
80415 + *     BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
80416 + */
80417 +
80418 +#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                     \
80419 +                                                                        \
80420 +/* Shared ring entry */                                                 \
80421 +union __name##_sring_entry {                                            \
80422 +    __req_t req;                                                        \
80423 +    __rsp_t rsp;                                                        \
80424 +};                                                                      \
80425 +                                                                        \
80426 +/* Shared ring page */                                                  \
80427 +struct __name##_sring {                                                 \
80428 +    RING_IDX req_prod, req_event;                                       \
80429 +    RING_IDX rsp_prod, rsp_event;                                       \
80430 +    uint8_t  pad[48];                                                   \
80431 +    union __name##_sring_entry ring[1]; /* variable-length */           \
80432 +};                                                                      \
80433 +                                                                        \
80434 +/* "Front" end's private variables */                                   \
80435 +struct __name##_front_ring {                                            \
80436 +    RING_IDX req_prod_pvt;                                              \
80437 +    RING_IDX rsp_cons;                                                  \
80438 +    unsigned int nr_ents;                                               \
80439 +    struct __name##_sring *sring;                                       \
80440 +};                                                                      \
80441 +                                                                        \
80442 +/* "Back" end's private variables */                                    \
80443 +struct __name##_back_ring {                                             \
80444 +    RING_IDX rsp_prod_pvt;                                              \
80445 +    RING_IDX req_cons;                                                  \
80446 +    unsigned int nr_ents;                                               \
80447 +    struct __name##_sring *sring;                                       \
80448 +};                                                                      \
80449 +                                                                        \
80450 +/* Syntactic sugar */                                                   \
80451 +typedef struct __name##_sring __name##_sring_t;                         \
80452 +typedef struct __name##_front_ring __name##_front_ring_t;               \
80453 +typedef struct __name##_back_ring __name##_back_ring_t
80454 +
80455 +/*
80456 + * Macros for manipulating rings.
80457 + * 
80458 + * FRONT_RING_whatever works on the "front end" of a ring: here 
80459 + * requests are pushed on to the ring and responses taken off it.
80460 + * 
80461 + * BACK_RING_whatever works on the "back end" of a ring: here 
80462 + * requests are taken off the ring and responses put on.
80463 + * 
80464 + * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL. 
80465 + * This is OK in 1-for-1 request-response situations where the 
80466 + * requestor (front end) never has more than RING_SIZE()-1
80467 + * outstanding requests.
80468 + */
80469 +
80470 +/* Initialising empty rings */
80471 +#define SHARED_RING_INIT(_s) do {                                       \
80472 +    (_s)->req_prod  = (_s)->rsp_prod  = 0;                              \
80473 +    (_s)->req_event = (_s)->rsp_event = 1;                              \
80474 +    memset((_s)->pad, 0, sizeof((_s)->pad));                            \
80475 +} while(0)
80476 +
80477 +#define FRONT_RING_INIT(_r, _s, __size) do {                            \
80478 +    (_r)->req_prod_pvt = 0;                                             \
80479 +    (_r)->rsp_cons = 0;                                                 \
80480 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
80481 +    (_r)->sring = (_s);                                                 \
80482 +} while (0)
80483 +
80484 +#define BACK_RING_INIT(_r, _s, __size) do {                             \
80485 +    (_r)->rsp_prod_pvt = 0;                                             \
80486 +    (_r)->req_cons = 0;                                                 \
80487 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
80488 +    (_r)->sring = (_s);                                                 \
80489 +} while (0)
80490 +
80491 +/* Initialise to existing shared indexes -- for recovery */
80492 +#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
80493 +    (_r)->sring = (_s);                                                 \
80494 +    (_r)->req_prod_pvt = (_s)->req_prod;                                \
80495 +    (_r)->rsp_cons = (_s)->rsp_prod;                                    \
80496 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
80497 +} while (0)
80498 +
80499 +#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
80500 +    (_r)->sring = (_s);                                                 \
80501 +    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
80502 +    (_r)->req_cons = (_s)->req_prod;                                    \
80503 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
80504 +} while (0)
80505 +
80506 +/* How big is this ring? */
80507 +#define RING_SIZE(_r)                                                   \
80508 +    ((_r)->nr_ents)
80509 +
80510 +/* Test if there is an empty slot available on the front ring.
80511 + * (This is only meaningful from the front.)
80512 + */
80513 +#define RING_FULL(_r)                                                   \
80514 +    (((_r)->req_prod_pvt - (_r)->rsp_cons) == RING_SIZE(_r))
80515 +
80516 +/* Test if there are outstanding messages to be processed on a ring. */
80517 +#define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
80518 +    ((_r)->rsp_cons != (_r)->sring->rsp_prod)
80519 +
80520 +#define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
80521 +    (((_r)->req_cons != (_r)->sring->req_prod) &&                       \
80522 +     (((_r)->req_cons - (_r)->rsp_prod_pvt) != RING_SIZE(_r)))
80523 +
80524 +/* Direct access to individual ring elements, by index. */
80525 +#define RING_GET_REQUEST(_r, _idx)                                      \
80526 +    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
80527 +
80528 +#define RING_GET_RESPONSE(_r, _idx)                                     \
80529 +    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
80530 +
80531 +/* Loop termination condition: Would the specified index overflow the ring? */
80532 +#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
80533 +    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
80534 +
80535 +#define RING_PUSH_REQUESTS(_r) do {                                     \
80536 +    wmb(); /* back sees requests /before/ updated producer index */     \
80537 +    (_r)->sring->req_prod = (_r)->req_prod_pvt;                         \
80538 +} while (0)
80539 +
80540 +#define RING_PUSH_RESPONSES(_r) do {                                    \
80541 +    wmb(); /* front sees responses /before/ updated producer index */   \
80542 +    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                         \
80543 +} while (0)
80544 +
80545 +/*
80546 + * Notification hold-off (req_event and rsp_event):
80547 + * 
80548 + * When queueing requests or responses on a shared ring, it may not always be
80549 + * necessary to notify the remote end. For example, if requests are in flight
80550 + * in a backend, the front may be able to queue further requests without
80551 + * notifying the back (if the back checks for new requests when it queues
80552 + * responses).
80553 + * 
80554 + * When enqueuing requests or responses:
80555 + * 
80556 + *  Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
80557 + *  is a boolean return value. True indicates that the receiver requires an
80558 + *  asynchronous notification.
80559 + * 
80560 + * After dequeuing requests or responses (before sleeping the connection):
80561 + * 
80562 + *  Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
80563 + *  The second argument is a boolean return value. True indicates that there
80564 + *  are pending messages on the ring (i.e., the connection should not be put
80565 + *  to sleep).
80566 + * 
80567 + *  These macros will set the req_event/rsp_event field to trigger a
80568 + *  notification on the very next message that is enqueued. If you want to
80569 + *  create batches of work (i.e., only receive a notification after several
80570 + *  messages have been enqueued) then you will need to create a customised
80571 + *  version of the FINAL_CHECK macro in your own code, which sets the event
80572 + *  field appropriately.
80573 + */
80574 +
80575 +#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {           \
80576 +    RING_IDX __old = (_r)->sring->req_prod;                             \
80577 +    RING_IDX __new = (_r)->req_prod_pvt;                                \
80578 +    wmb(); /* back sees requests /before/ updated producer index */     \
80579 +    (_r)->sring->req_prod = __new;                                      \
80580 +    mb(); /* back sees new requests /before/ we check req_event */      \
80581 +    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <           \
80582 +                 (RING_IDX)(__new - __old));                            \
80583 +} while (0)
80584 +
80585 +#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {          \
80586 +    RING_IDX __old = (_r)->sring->rsp_prod;                             \
80587 +    RING_IDX __new = (_r)->rsp_prod_pvt;                                \
80588 +    wmb(); /* front sees responses /before/ updated producer index */   \
80589 +    (_r)->sring->rsp_prod = __new;                                      \
80590 +    mb(); /* front sees new responses /before/ we check rsp_event */    \
80591 +    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <           \
80592 +                 (RING_IDX)(__new - __old));                            \
80593 +} while (0)
80594 +
80595 +#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do {             \
80596 +    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
80597 +    if (_work_to_do) break;                                             \
80598 +    (_r)->sring->req_event = (_r)->req_cons + 1;                        \
80599 +    mb();                                                               \
80600 +    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
80601 +} while (0)
80602 +
80603 +#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do {            \
80604 +    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
80605 +    if (_work_to_do) break;                                             \
80606 +    (_r)->sring->rsp_event = (_r)->rsp_cons + 1;                        \
80607 +    mb();                                                               \
80608 +    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
80609 +} while (0)
80610 +
80611 +#endif /* __XEN_PUBLIC_IO_RING_H__ */
80612 +
80613 +/*
80614 + * Local variables:
80615 + * mode: C
80616 + * c-set-style: "BSD"
80617 + * c-basic-offset: 4
80618 + * tab-width: 4
80619 + * indent-tabs-mode: nil
80620 + * End:
80621 + */
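A sketch of the canonical backend consumption loop these macros are designed for, reusing the mytag/request_t/response_t names from the DEFINE_RING_TYPES comment above; do_request() and notify_remote_via_evtchn() are placeholders for backend-specific work and the event-channel kick.

static void example_backend_work(mytag_back_ring_t *ring, int evtchn)
{
    int notify, more_to_do;

    do {
        RING_IDX rp = ring->sring->req_prod;
        rmb(); /* read requests only after seeing the producer index */

        while (ring->req_cons != rp &&
               !RING_REQUEST_CONS_OVERFLOW(ring, ring->req_cons)) {
            request_t  *req = RING_GET_REQUEST(ring, ring->req_cons++);
            response_t *rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt++);
            do_request(req, rsp);               /* fill in the response */
        }

        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(ring, notify);
        if (notify)
            notify_remote_via_evtchn(evtchn);

        /* Re-arm req_event and re-check before going idle. */
        RING_FINAL_CHECK_FOR_REQUESTS(ring, more_to_do);
    } while (more_to_do);
}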
80622 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/io/tpmif.h tmp-linux-2.6-xen.patch/include/xen/interface/io/tpmif.h
80623 --- ref-linux-2.6.16.9/include/xen/interface/io/tpmif.h 1970-01-01 01:00:00.000000000 +0100
80624 +++ tmp-linux-2.6-xen.patch/include/xen/interface/io/tpmif.h    2006-04-10 00:05:57.000000000 +0200
80625 @@ -0,0 +1,56 @@
80626 +/******************************************************************************
80627 + * tpmif.h
80628 + *
80629 + * TPM I/O interface for Xen guest OSes.
80630 + *
80631 + * Copyright (c) 2005, IBM Corporation
80632 + *
80633 + * Author: Stefan Berger, stefanb@us.ibm.com
80634 + * Grant table support: Mahadevan Gomathisankaran
80635 + *
80636 + * This code has been derived from tools/libxc/xen/io/netif.h
80637 + *
80638 + * Copyright (c) 2003-2004, Keir Fraser
80639 + */
80640 +
80641 +#ifndef __XEN_PUBLIC_IO_TPMIF_H__
80642 +#define __XEN_PUBLIC_IO_TPMIF_H__
80643 +
80644 +#include "../grant_table.h"
80645 +
80646 +typedef struct {
80647 +    unsigned long addr;   /* Machine address of packet.   */
80648 +    grant_ref_t ref;      /* grant table access reference */
80649 +    uint16_t unused;
80650 +    uint16_t size;        /* Packet size in bytes.        */
80651 +} tpmif_tx_request_t;
80652 +
80653 +/*
80654 + * TPMIF_TX_RING_SIZE defines the number of pages the frontend and
80655 + * backend can exchange (i.e., the size of the ring array below).
80656 + */
80657 +typedef uint32_t TPMIF_RING_IDX;
80658 +
80659 +#define TPMIF_TX_RING_SIZE 10
80660 +
80661 +/* This structure must fit in a memory page. */
80662 +
80663 +typedef struct {
80664 +    tpmif_tx_request_t req;
80665 +} tpmif_ring_t;
80666 +
80667 +typedef struct {
80668 +    tpmif_ring_t ring[TPMIF_TX_RING_SIZE];
80669 +} tpmif_tx_interface_t;
80670 +
80671 +#endif
80672 +
80673 +/*
80674 + * Local variables:
80675 + * mode: C
80676 + * c-set-style: "BSD"
80677 + * c-basic-offset: 4
80678 + * tab-width: 4
80679 + * indent-tabs-mode: nil
80680 + * End:
80681 + */
80682 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/io/xenbus.h tmp-linux-2.6-xen.patch/include/xen/interface/io/xenbus.h
80683 --- ref-linux-2.6.16.9/include/xen/interface/io/xenbus.h        1970-01-01 01:00:00.000000000 +0100
80684 +++ tmp-linux-2.6-xen.patch/include/xen/interface/io/xenbus.h   2006-04-10 00:05:57.000000000 +0200
80685 @@ -0,0 +1,42 @@
80686 +/*****************************************************************************
80687 + * xenbus.h
80688 + *
80689 + * Xenbus protocol details.
80690 + *
80691 + * Copyright (C) 2005 XenSource Ltd.
80692 + */
80693 +
80694 +#ifndef _XEN_PUBLIC_IO_XENBUS_H
80695 +#define _XEN_PUBLIC_IO_XENBUS_H
80696 +
80697 +/* The state of either end of the Xenbus, i.e. the current communication
80698 +   status of initialisation across the bus.  States here imply nothing about
80699 +   the state of the connection between the driver and the kernel's device
80700 +   layers.  */
80701 +typedef enum
80702 +{
80703 +  XenbusStateUnknown      = 0,
80704 +  XenbusStateInitialising = 1,
80705 +  XenbusStateInitWait     = 2,  /* Finished early initialisation, but waiting
80706 +                                   for information from the peer or hotplug
80707 +                                   scripts. */
80708 +  XenbusStateInitialised  = 3,  /* Initialised and waiting for a connection
80709 +                                  from the peer. */
80710 +  XenbusStateConnected    = 4,
80711 +  XenbusStateClosing      = 5,  /* The device is being closed due to an error
80712 +                                  or an unplug event. */
80713 +  XenbusStateClosed       = 6
80714 +
80715 +} XenbusState;
80716 +
80717 +#endif /* _XEN_PUBLIC_IO_XENBUS_H */
80718 +
80719 +/*
80720 + * Local variables:
80721 + *  c-file-style: "linux"
80722 + *  indent-tabs-mode: t
80723 + *  c-indent-level: 8
80724 + *  c-basic-offset: 8
80725 + *  tab-width: 8
80726 + * End:
80727 + */
80728 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/io/xs_wire.h tmp-linux-2.6-xen.patch/include/xen/interface/io/xs_wire.h
80729 --- ref-linux-2.6.16.9/include/xen/interface/io/xs_wire.h       1970-01-01 01:00:00.000000000 +0100
80730 +++ tmp-linux-2.6-xen.patch/include/xen/interface/io/xs_wire.h  2006-04-10 00:05:57.000000000 +0200
80731 @@ -0,0 +1,97 @@
80732 +/*
80733 + * Details of the "wire" protocol between Xen Store Daemon and client
80734 + * library or guest kernel.
80735 + * Copyright (C) 2005 Rusty Russell IBM Corporation
80736 + */
80737 +
80738 +#ifndef _XS_WIRE_H
80739 +#define _XS_WIRE_H
80740 +
80741 +enum xsd_sockmsg_type
80742 +{
80743 +    XS_DEBUG,
80744 +    XS_DIRECTORY,
80745 +    XS_READ,
80746 +    XS_GET_PERMS,
80747 +    XS_WATCH,
80748 +    XS_UNWATCH,
80749 +    XS_TRANSACTION_START,
80750 +    XS_TRANSACTION_END,
80751 +    XS_INTRODUCE,
80752 +    XS_RELEASE,
80753 +    XS_GET_DOMAIN_PATH,
80754 +    XS_WRITE,
80755 +    XS_MKDIR,
80756 +    XS_RM,
80757 +    XS_SET_PERMS,
80758 +    XS_WATCH_EVENT,
80759 +    XS_ERROR,
80760 +    XS_IS_DOMAIN_INTRODUCED
80761 +};
80762 +
80763 +#define XS_WRITE_NONE "NONE"
80764 +#define XS_WRITE_CREATE "CREATE"
80765 +#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
80766 +
80767 +/* We hand errors as strings, for portability. */
80768 +struct xsd_errors
80769 +{
80770 +    int errnum;
80771 +    const char *errstring;
80772 +};
80773 +#define XSD_ERROR(x) { x, #x }
80774 +static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
80775 +    XSD_ERROR(EINVAL),
80776 +    XSD_ERROR(EACCES),
80777 +    XSD_ERROR(EEXIST),
80778 +    XSD_ERROR(EISDIR),
80779 +    XSD_ERROR(ENOENT),
80780 +    XSD_ERROR(ENOMEM),
80781 +    XSD_ERROR(ENOSPC),
80782 +    XSD_ERROR(EIO),
80783 +    XSD_ERROR(ENOTEMPTY),
80784 +    XSD_ERROR(ENOSYS),
80785 +    XSD_ERROR(EROFS),
80786 +    XSD_ERROR(EBUSY),
80787 +    XSD_ERROR(EAGAIN),
80788 +    XSD_ERROR(EISCONN)
80789 +};
80790 +
80791 +struct xsd_sockmsg
80792 +{
80793 +    uint32_t type;  /* XS_??? */
80794 +    uint32_t req_id;/* Request identifier, echoed in daemon's response.  */
80795 +    uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
80796 +    uint32_t len;   /* Length of data following this. */
80797 +
80798 +    /* Generally followed by nul-terminated string(s). */
80799 +};
80800 +
80801 +enum xs_watch_type
80802 +{
80803 +    XS_WATCH_PATH = 0,
80804 +    XS_WATCH_TOKEN
80805 +};
80806 +
80807 +/* Inter-domain shared memory communications. */
80808 +#define XENSTORE_RING_SIZE 1024
80809 +typedef uint32_t XENSTORE_RING_IDX;
80810 +#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
80811 +struct xenstore_domain_interface {
80812 +    char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
80813 +    char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
80814 +    XENSTORE_RING_IDX req_cons, req_prod;
80815 +    XENSTORE_RING_IDX rsp_cons, rsp_prod;
80816 +};
80817 +
80818 +#endif /* _XS_WIRE_H */
80819 +
80820 +/*
80821 + * Local variables:
80822 + * mode: C
80823 + * c-set-style: "BSD"
80824 + * c-basic-offset: 4
80825 + * tab-width: 4
80826 + * indent-tabs-mode: nil
80827 + * End:
80828 + */
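A minimal sketch of the wire format: a struct xsd_sockmsg header followed by its nul-terminated payload, copied byte-wise into the request ring. It assumes the ring has enough free space (real code must check req_prod - req_cons) and omits the event-channel kick.

static void example_xs_read(struct xenstore_domain_interface *intf,
                            const char *path, uint32_t req_id)
{
    struct xsd_sockmsg msg;
    const char *hdr = (const char *)&msg;
    uint32_t i;

    msg.type   = XS_READ;
    msg.req_id = req_id;           /* echoed in the daemon's reply  */
    msg.tx_id  = 0;                /* not part of a transaction     */
    msg.len    = strlen(path) + 1; /* payload: nul-terminated path  */

    for (i = 0; i < sizeof(msg); i++)      /* header first...      */
        intf->req[MASK_XENSTORE_IDX(intf->req_prod + i)] = hdr[i];
    for (i = 0; i < msg.len; i++)          /* ...then the payload  */
        intf->req[MASK_XENSTORE_IDX(intf->req_prod + sizeof(msg) + i)] =
            path[i];

    wmb(); /* data visible before the producer index moves */
    intf->req_prod += sizeof(msg) + msg.len;
    /* ... notify the xenstore event channel here ... */
}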
80829 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/memory.h tmp-linux-2.6-xen.patch/include/xen/interface/memory.h
80830 --- ref-linux-2.6.16.9/include/xen/interface/memory.h   1970-01-01 01:00:00.000000000 +0100
80831 +++ tmp-linux-2.6-xen.patch/include/xen/interface/memory.h      2006-04-10 00:05:57.000000000 +0200
80832 @@ -0,0 +1,155 @@
80833 +/******************************************************************************
80834 + * memory.h
80835 + * 
80836 + * Memory reservation and information.
80837 + * 
80838 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
80839 + */
80840 +
80841 +#ifndef __XEN_PUBLIC_MEMORY_H__
80842 +#define __XEN_PUBLIC_MEMORY_H__
80843 +
80844 +/*
80845 + * Increase or decrease the specified domain's memory reservation. Returns a
80846 + * -ve errcode on failure, or the # of extents successfully allocated or freed.
80847 + * arg == addr of struct xen_memory_reservation.
80848 + */
80849 +#define XENMEM_increase_reservation 0
80850 +#define XENMEM_decrease_reservation 1
80851 +#define XENMEM_populate_physmap     6
80852 +typedef struct xen_memory_reservation {
80853 +
80854 +    /*
80855 +     * XENMEM_increase_reservation:
80856 +     *   OUT: MFN (*not* GMFN) bases of extents that were allocated
80857 +     * XENMEM_decrease_reservation:
80858 +     *   IN:  GMFN bases of extents to free
80859 +     * XENMEM_populate_physmap:
80860 +     *   IN:  GPFN bases of extents to populate with memory
80861 +     *   OUT: GMFN bases of extents that were allocated
80862 +     *   (NB. This command also updates the mach_to_phys translation table)
80863 +     */
80864 +    GUEST_HANDLE(ulong) extent_start;
80865 +
80866 +    /* Number of extents, and size/alignment of each (2^extent_order pages). */
80867 +    unsigned long  nr_extents;
80868 +    unsigned int   extent_order;
80869 +
80870 +    /*
80871 +     * Maximum # bits addressable by the user of the allocated region (e.g., 
80872 +     * I/O devices often have a 32-bit limitation even in 64-bit systems). If 
80873 +     * zero then the user has no addressing restriction.
80874 +     * This field is not used by XENMEM_decrease_reservation.
80875 +     */
80876 +    unsigned int   address_bits;
80877 +
80878 +    /*
80879 +     * Domain whose reservation is being changed.
80880 +     * Unprivileged domains can specify only DOMID_SELF.
80881 +     */
80882 +    domid_t        domid;
80883 +
80884 +} xen_memory_reservation_t;
80885 +DEFINE_GUEST_HANDLE(xen_memory_reservation_t);
80886 +
80887 +/*
80888 + * Returns the maximum machine frame number of mapped RAM in this system.
80889 + * This command always succeeds (it never returns an error code).
80890 + * arg == NULL.
80891 + */
80892 +#define XENMEM_maximum_ram_page     2
80893 +
80894 +/*
80895 + * Returns the current or maximum memory reservation, in pages, of the
80896 + * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
80897 + * arg == addr of domid_t.
80898 + */
80899 +#define XENMEM_current_reservation  3
80900 +#define XENMEM_maximum_reservation  4
80901 +
80902 +/*
80903 + * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
80904 + * mapping table. Architectures which do not have an m2p table do not implement
80905 + * this command.
80906 + * arg == addr of xen_machphys_mfn_list_t.
80907 + */
80908 +#define XENMEM_machphys_mfn_list    5
80909 +typedef struct xen_machphys_mfn_list {
80910 +    /*
80911 +     * Size of the 'extent_start' array. Fewer entries will be filled if the
80912 +     * machphys table is smaller than max_extents * 2MB.
80913 +     */
80914 +    unsigned int max_extents;
80915 +
80916 +    /*
80917 +     * Pointer to buffer to fill with list of extent starts. If there are
80918 +     * any large discontiguities in the machine address space, 2MB gaps in
80919 +     * the machphys table will be represented by an MFN base of zero.
80920 +     */
80921 +    GUEST_HANDLE(ulong) extent_start;
80922 +
80923 +    /*
80924 +     * Number of extents written to the above array. This will be smaller
80925 +     * than 'max_extents' if the machphys table is smaller than max_extents * 2MB.
80926 +     */
80927 +    unsigned int nr_extents;
80928 +} xen_machphys_mfn_list_t;
80929 +DEFINE_GUEST_HANDLE(xen_machphys_mfn_list_t);
80930 +
80931 +/*
80932 + * Sets the GPFN at which a particular page appears in the specified guest's
80933 + * pseudophysical address space.
80934 + * arg == addr of xen_add_to_physmap_t.
80935 + */
80936 +#define XENMEM_add_to_physmap      7
80937 +typedef struct xen_add_to_physmap {
80938 +    /* Which domain to change the mapping for. */
80939 +    domid_t domid;
80940 +
80941 +    /* Source mapping space. */
80942 +#define XENMAPSPACE_shared_info 0 /* shared info page */
80943 +#define XENMAPSPACE_grant_table 1 /* grant table page */
80944 +    unsigned int space;
80945 +
80946 +    /* Index into source mapping space. */
80947 +    unsigned long idx;
80948 +
80949 +    /* GPFN where the source mapping page should appear. */
80950 +    unsigned long gpfn;
80951 +} xen_add_to_physmap_t;
80952 +DEFINE_GUEST_HANDLE(xen_add_to_physmap_t);
80953 +
80954 +/*
80955 + * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
80956 + * code on failure. This call only works for auto-translated guests.
80957 + */
80958 +#define XENMEM_translate_gpfn_list  8
80959 +typedef struct xen_translate_gpfn_list {
80960 +    /* Which domain to translate for? */
80961 +    domid_t domid;
80962 +
80963 +    /* Length of list. */
80964 +    unsigned long nr_gpfns;
80965 +
80966 +    /* List of GPFNs to translate. */
80967 +    GUEST_HANDLE(ulong) gpfn_list;
80968 +
80969 +    /*
80970 +     * Output list to contain MFN translations. May be the same as the input
80971 +     * list (in which case each input GPFN is overwritten with the output MFN).
80972 +     */
80973 +    GUEST_HANDLE(ulong) mfn_list;
80974 +} xen_translate_gpfn_list_t;
80975 +DEFINE_GUEST_HANDLE(xen_translate_gpfn_list_t);
80976 +
80977 +#endif /* __XEN_PUBLIC_MEMORY_H__ */
80978 +
80979 +/*
80980 + * Local variables:
80981 + * mode: C
80982 + * c-set-style: "BSD"
80983 + * c-basic-offset: 4
80984 + * tab-width: 4
80985 + * indent-tabs-mode: nil
80986 + * End:
80987 + */
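A hedged sketch of a balloon-style page release through this interface. HYPERVISOR_memory_op() and set_xen_guest_handle() are assumed from the patched tree, and gmfn is the guest frame being returned to Xen.

static int example_release_page(unsigned long gmfn)
{
    struct xen_memory_reservation reservation = {
        .nr_extents   = 1,
        .extent_order = 0,          /* one single-page extent */
        .address_bits = 0,          /* ignored for decrease   */
        .domid        = DOMID_SELF,
    };

    set_xen_guest_handle(reservation.extent_start, &gmfn);
    /* Returns the number of extents freed (1) or a -ve errcode. */
    return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
}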
80988 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/nmi.h tmp-linux-2.6-xen.patch/include/xen/interface/nmi.h
80989 --- ref-linux-2.6.16.9/include/xen/interface/nmi.h      1970-01-01 01:00:00.000000000 +0100
80990 +++ tmp-linux-2.6-xen.patch/include/xen/interface/nmi.h 2006-04-10 00:05:57.000000000 +0200
80991 @@ -0,0 +1,59 @@
80992 +/******************************************************************************
80993 + * nmi.h
80994 + * 
80995 + * NMI callback registration and reason codes.
80996 + * 
80997 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
80998 + */
80999 +
81000 +#ifndef __XEN_PUBLIC_NMI_H__
81001 +#define __XEN_PUBLIC_NMI_H__
81002 +
81003 +/*
81004 + * NMI reason codes:
81005 + * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
81006 + */
81007 + /* I/O-check error reported via ISA port 0x61, bit 6. */
81008 +#define _XEN_NMIREASON_io_error     0
81009 +#define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
81010 + /* Parity error reported via ISA port 0x61, bit 7. */
81011 +#define _XEN_NMIREASON_parity_error 1
81012 +#define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
81013 + /* Unknown hardware-generated NMI. */
81014 +#define _XEN_NMIREASON_unknown      2
81015 +#define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
81016 +
81017 +/*
81018 + * long nmi_op(unsigned int cmd, void *arg)
81019 + * NB. All ops return zero on success, else a negative error code.
81020 + */
81021 +
81022 +/*
81023 + * Register NMI callback for this (calling) VCPU. Currently this only makes
81024 + * sense for domain 0, vcpu 0. All other callers will receive EINVAL.
81025 + * arg == pointer to xennmi_callback structure.
81026 + */
81027 +#define XENNMI_register_callback   0
81028 +typedef struct xennmi_callback {
81029 +    unsigned long handler_address;
81030 +    unsigned long pad;
81031 +} xennmi_callback_t;
81032 +DEFINE_GUEST_HANDLE(xennmi_callback_t);
81033 +
81034 +/*
81035 + * Deregister NMI callback for this (calling) VCPU.
81036 + * arg == NULL.
81037 + */
81038 +#define XENNMI_unregister_callback 1
81039 +
81040 +#endif /* __XEN_PUBLIC_NMI_H__ */
81041 +
81042 +/*
81043 + * Local variables:
81044 + * mode: C
81045 + * c-set-style: "BSD"
81046 + * c-basic-offset: 4
81047 + * tab-width: 4
81048 + * indent-tabs-mode: nil
81049 + * End:
81050 + */
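A sketch of callback registration as described above (dom0, vcpu0 only); nmi_entry is a hypothetical low-level entry point and HYPERVISOR_nmi_op() the assumed hypercall wrapper.

extern void nmi_entry(void);   /* hypothetical low-level handler */

static int example_register_nmi(void)
{
    xennmi_callback_t cb = {
        .handler_address = (unsigned long)nmi_entry,
        .pad             = 0,
    };
    return HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
}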
81051 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/physdev.h tmp-linux-2.6-xen.patch/include/xen/interface/physdev.h
81052 --- ref-linux-2.6.16.9/include/xen/interface/physdev.h  1970-01-01 01:00:00.000000000 +0100
81053 +++ tmp-linux-2.6-xen.patch/include/xen/interface/physdev.h     2006-04-10 00:05:57.000000000 +0200
81054 @@ -0,0 +1,71 @@
81055 +
81056 +#ifndef __XEN_PUBLIC_PHYSDEV_H__
81057 +#define __XEN_PUBLIC_PHYSDEV_H__
81058 +
81059 +/* Commands to HYPERVISOR_physdev_op() */
81060 +#define PHYSDEVOP_IRQ_UNMASK_NOTIFY     4
81061 +#define PHYSDEVOP_IRQ_STATUS_QUERY      5
81062 +#define PHYSDEVOP_SET_IOPL              6
81063 +#define PHYSDEVOP_SET_IOBITMAP          7
81064 +#define PHYSDEVOP_APIC_READ             8
81065 +#define PHYSDEVOP_APIC_WRITE            9
81066 +#define PHYSDEVOP_ASSIGN_VECTOR         10
81067 +
81068 +typedef struct physdevop_irq_status_query {
81069 +    /* IN */
81070 +    uint32_t irq;
81071 +    /* OUT */
81072 +/* Need to call PHYSDEVOP_IRQ_UNMASK_NOTIFY when the IRQ has been serviced? */
81073 +#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY (1<<0)
81074 +    uint32_t flags;
81075 +} physdevop_irq_status_query_t;
81076 +
81077 +typedef struct physdevop_set_iopl {
81078 +    /* IN */
81079 +    uint32_t iopl;
81080 +} physdevop_set_iopl_t;
81081 +
81082 +typedef struct physdevop_set_iobitmap {
81083 +    /* IN */
81084 +    uint8_t *bitmap;
81085 +    uint32_t nr_ports;
81086 +} physdevop_set_iobitmap_t;
81087 +
81088 +typedef struct physdevop_apic {
81089 +    /* IN */
81090 +    unsigned long apic_physbase;
81091 +    uint32_t reg;
81092 +    /* IN or OUT */
81093 +    uint32_t value;
81094 +} physdevop_apic_t;
81095 +
81096 +typedef struct physdevop_irq {
81097 +    /* IN */
81098 +    uint32_t irq;
81099 +    /* OUT */
81100 +    uint32_t vector;
81101 +} physdevop_irq_t;
81102 +
81103 +typedef struct physdev_op {
81104 +    uint32_t cmd;
81105 +    union {
81106 +        physdevop_irq_status_query_t      irq_status_query;
81107 +        physdevop_set_iopl_t              set_iopl;
81108 +        physdevop_set_iobitmap_t          set_iobitmap;
81109 +        physdevop_apic_t                  apic_op;
81110 +        physdevop_irq_t                   irq_op;
81111 +    } u;
81112 +} physdev_op_t;
81113 +DEFINE_GUEST_HANDLE(physdev_op_t);
81114 +
81115 +#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
81116 +
81117 +/*
81118 + * Local variables:
81119 + * mode: C
81120 + * c-set-style: "BSD"
81121 + * c-basic-offset: 4
81122 + * tab-width: 4
81123 + * indent-tabs-mode: nil
81124 + * End:
81125 + */
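A sketch of driving the physdev_op multiplexer, here requesting I/O privilege level 1; the HYPERVISOR_physdev_op() wrapper is assumed from the patched tree.

static int example_set_iopl(void)
{
    physdev_op_t op;

    op.cmd             = PHYSDEVOP_SET_IOPL;
    op.u.set_iopl.iopl = 1;        /* allow ring-1 port I/O */
    return HYPERVISOR_physdev_op(&op);
}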
81126 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/sched_ctl.h tmp-linux-2.6-xen.patch/include/xen/interface/sched_ctl.h
81127 --- ref-linux-2.6.16.9/include/xen/interface/sched_ctl.h        1970-01-01 01:00:00.000000000 +0100
81128 +++ tmp-linux-2.6-xen.patch/include/xen/interface/sched_ctl.h   2006-04-10 00:05:57.000000000 +0200
81129 @@ -0,0 +1,64 @@
81130 +/******************************************************************************
81131 + * Generic scheduler control interface.
81132 + *
81133 + * Mark Williamson, (C) 2004 Intel Research Cambridge
81134 + */
81135 +
81136 +#ifndef __XEN_PUBLIC_SCHED_CTL_H__
81137 +#define __XEN_PUBLIC_SCHED_CTL_H__
81138 +
81139 +/* Scheduler types. */
81140 +#define SCHED_BVT      0
81141 +#define SCHED_SEDF     4
81142 +
81143 +/* Set or get info? */
81144 +#define SCHED_INFO_PUT 0
81145 +#define SCHED_INFO_GET 1
81146 +
81147 +/*
81148 + * Generic scheduler control command - used to adjust system-wide scheduler
81149 + * parameters.
81150 + */
81151 +struct sched_ctl_cmd {
81152 +    uint32_t sched_id;
81153 +    uint32_t direction;
81154 +    union {
81155 +        struct bvt_ctl {
81156 +            uint32_t ctx_allow;
81157 +        } bvt;
81158 +    } u;
81159 +};
81160 +
81161 +struct sched_adjdom_cmd {
81162 +    uint32_t sched_id;
81163 +    uint32_t direction;
81164 +    domid_t  domain;
81165 +    union {
81166 +        struct bvt_adjdom {
81167 +            uint32_t mcu_adv;      /* mcu advance: inverse of weight */
81168 +            uint32_t warpback;     /* warp? */
81169 +            int32_t  warpvalue;    /* warp value */
81170 +            int64_t  warpl;        /* warp limit */
81171 +            int64_t  warpu;        /* unwarp time requirement */
81172 +        } bvt;
81173 +        struct sedf_adjdom {
81174 +            uint64_t period;
81175 +            uint64_t slice;
81176 +            uint64_t latency;
81177 +            uint32_t extratime;
81178 +            uint32_t weight;
81179 +        } sedf;
81180 +    } u;
81181 +};
81182 +
81183 +#endif /* __XEN_PUBLIC_SCHED_CTL_H__ */
81184 +
81185 +/*
81186 + * Local variables:
81187 + * mode: C
81188 + * c-set-style: "BSD"
81189 + * c-basic-offset: 4
81190 + * tab-width: 4
81191 + * indent-tabs-mode: nil
81192 + * End:
81193 + */
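An illustrative initialiser for the SEDF branch of sched_adjdom_cmd (the domid and timing values are made up; submitting the command through dom0's control interface is not shown here).

static struct sched_adjdom_cmd example_sedf_cmd = {
    .sched_id  = SCHED_SEDF,
    .direction = SCHED_INFO_PUT,
    .domain    = 1,                  /* hypothetical domid      */
    .u.sedf = {
        .period    = 100000000ULL,   /* 100ms period, in ns     */
        .slice     =  10000000ULL,   /* 10ms of CPU per period  */
        .latency   = 0,
        .extratime = 1,              /* may consume slack time  */
        .weight    = 0,
    },
};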
81194 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/sched.h tmp-linux-2.6-xen.patch/include/xen/interface/sched.h
81195 --- ref-linux-2.6.16.9/include/xen/interface/sched.h    1970-01-01 01:00:00.000000000 +0100
81196 +++ tmp-linux-2.6-xen.patch/include/xen/interface/sched.h       2006-04-10 00:05:57.000000000 +0200
81197 @@ -0,0 +1,87 @@
81198 +/******************************************************************************
81199 + * sched.h
81200 + * 
81201 + * Scheduler state interactions
81202 + * 
81203 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
81204 + */
81205 +
81206 +#ifndef __XEN_PUBLIC_SCHED_H__
81207 +#define __XEN_PUBLIC_SCHED_H__
81208 +
81209 +#include "event_channel.h"
81210 +
81211 +/*
81212 + * The prototype for this hypercall is:
81213 + *  long sched_op(int cmd, void *arg)
81214 + * @cmd == SCHEDOP_??? (scheduler operation).
81215 + * @arg == Operation-specific extra argument(s), as described below.
81216 + * 
81217 + * Versions of Xen prior to 3.0.2 provided only the following legacy version
81218 + * of this hypercall, supporting only the commands yield, block and shutdown:
81219 + *  long sched_op(int cmd, unsigned long arg)
81220 + * @cmd == SCHEDOP_??? (scheduler operation).
81221 + * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
81222 + *      == SHUTDOWN_* code (SCHEDOP_shutdown)
81223 + * This legacy version is available to new guests as sched_op_compat().
81224 + */
81225 +
81226 +/*
81227 + * Voluntarily yield the CPU.
81228 + * @arg == NULL.
81229 + */
81230 +#define SCHEDOP_yield       0
81231 +
81232 +/*
81233 + * Block execution of this VCPU until an event is received for processing.
81234 + * If called with event upcalls masked, this operation will atomically
81235 + * reenable event delivery and check for pending events before blocking the
81236 + * VCPU. This avoids a "wakeup waiting" race.
81237 + * @arg == NULL.
81238 + */
81239 +#define SCHEDOP_block       1
81240 +
81241 +/*
81242 + * Halt execution of this domain (all VCPUs) and notify the system controller.
81243 + * @arg == pointer to sched_shutdown structure.
81244 + */
81245 +#define SCHEDOP_shutdown    2
81246 +typedef struct sched_shutdown {
81247 +    unsigned int reason; /* SHUTDOWN_* */
81248 +} sched_shutdown_t;
81249 +DEFINE_GUEST_HANDLE(sched_shutdown_t);
81250 +
81251 +/*
81252 + * Poll a set of event-channel ports. Return when one or more are pending. An
81253 + * optional timeout may be specified.
81254 + * @arg == pointer to sched_poll structure.
81255 + */
81256 +#define SCHEDOP_poll        3
81257 +typedef struct sched_poll {
81258 +    GUEST_HANDLE(evtchn_port_t) ports;
81259 +    unsigned int nr_ports;
81260 +    uint64_t timeout;
81261 +} sched_poll_t;
81262 +DEFINE_GUEST_HANDLE(sched_poll_t);
81263 +
81264 +/*
81265 + * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
81266 + * software to determine the appropriate action. For the most part, Xen does
81267 + * not care about the shutdown code.
81268 + */
81269 +#define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up and kill. */
81270 +#define SHUTDOWN_reboot     1  /* Clean up, kill, and then restart.          */
81271 +#define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
81272 +#define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
81273 +
81274 +#endif /* __XEN_PUBLIC_SCHED_H__ */
81275 +
81276 +/*
81277 + * Local variables:
81278 + * mode: C
81279 + * c-set-style: "BSD"
81280 + * c-basic-offset: 4
81281 + * tab-width: 4
81282 + * indent-tabs-mode: nil
81283 + * End:
81284 + */
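A sketch of SCHEDOP_poll blocking on a single event channel. The timeout is absolute system time in nanoseconds; HYPERVISOR_sched_op() and set_xen_guest_handle() are assumed from the patched tree.

static long example_poll_port(evtchn_port_t port, uint64_t now_ns)
{
    sched_poll_t poll;

    set_xen_guest_handle(poll.ports, &port);
    poll.nr_ports = 1;
    poll.timeout  = now_ns + 10000000ULL;  /* wake after 10ms at most */
    return HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
}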
81285 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/trace.h tmp-linux-2.6-xen.patch/include/xen/interface/trace.h
81286 --- ref-linux-2.6.16.9/include/xen/interface/trace.h    1970-01-01 01:00:00.000000000 +0100
81287 +++ tmp-linux-2.6-xen.patch/include/xen/interface/trace.h       2006-04-10 00:05:57.000000000 +0200
81288 @@ -0,0 +1,86 @@
81289 +/******************************************************************************
81290 + * include/public/trace.h
81291 + * 
81292 + * Mark Williamson, (C) 2004 Intel Research Cambridge
81293 + * Copyright (C) 2005 Bin Ren
81294 + */
81295 +
81296 +#ifndef __XEN_PUBLIC_TRACE_H__
81297 +#define __XEN_PUBLIC_TRACE_H__
81298 +
81299 +/* Trace classes */
81300 +#define TRC_CLS_SHIFT 16
81301 +#define TRC_GEN     0x0001f000    /* General trace            */
81302 +#define TRC_SCHED   0x0002f000    /* Xen Scheduler trace      */
81303 +#define TRC_DOM0OP  0x0004f000    /* Xen DOM0 operation trace */
81304 +#define TRC_VMX     0x0008f000    /* Xen VMX trace            */
81305 +#define TRC_MEM     0x000af000    /* Xen memory trace         */
81306 +#define TRC_ALL     0xfffff000
81307 +
81308 +/* Trace subclasses */
81309 +#define TRC_SUBCLS_SHIFT 12
81310 +/* trace subclasses for VMX */
81311 +#define TRC_VMXEXIT  0x00081000   /* VMX exit trace            */
81312 +#define TRC_VMXTIMER 0x00082000   /* VMX timer trace           */
81313 +#define TRC_VMXINT   0x00084000   /* VMX interrupt trace       */
81314 +#define TRC_VMXIO    0x00088000   /* VMX io emulation trace  */
81315 +
81316 +/* Trace events per class */
81317 +
81318 +#define TRC_SCHED_DOM_ADD       (TRC_SCHED +  1)
81319 +#define TRC_SCHED_DOM_REM       (TRC_SCHED +  2)
81320 +#define TRC_SCHED_SLEEP         (TRC_SCHED +  3)
81321 +#define TRC_SCHED_WAKE          (TRC_SCHED +  4)
81322 +#define TRC_SCHED_YIELD         (TRC_SCHED +  5)
81323 +#define TRC_SCHED_BLOCK         (TRC_SCHED +  6)
81324 +#define TRC_SCHED_SHUTDOWN      (TRC_SCHED +  7)
81325 +#define TRC_SCHED_CTL           (TRC_SCHED +  8)
81326 +#define TRC_SCHED_ADJDOM        (TRC_SCHED +  9)
81327 +#define TRC_SCHED_SWITCH        (TRC_SCHED + 10)
81328 +#define TRC_SCHED_S_TIMER_FN    (TRC_SCHED + 11)
81329 +#define TRC_SCHED_T_TIMER_FN    (TRC_SCHED + 12)
81330 +#define TRC_SCHED_DOM_TIMER_FN  (TRC_SCHED + 13)
81331 +#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
81332 +#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
81333 +
81334 +#define TRC_MEM_PAGE_GRANT_MAP      (TRC_MEM + 1)
81335 +#define TRC_MEM_PAGE_GRANT_UNMAP    (TRC_MEM + 2)
81336 +#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
81337 +
81338 +/* trace events per subclass */
81339 +#define TRC_VMX_VMEXIT          (TRC_VMXEXIT + 1)
81340 +#define TRC_VMX_VMENTRY         (TRC_VMXEXIT + 2)
81341 +
81342 +#define TRC_VMX_TIMER_INTR      (TRC_VMXTIMER + 1)
81343 +
81344 +#define TRC_VMX_INT             (TRC_VMXINT + 1)
81345 +
81346 +
81347 +/* This structure represents a single trace buffer record. */
81348 +struct t_rec {
81349 +    uint64_t cycles;          /* cycle counter timestamp */
81350 +    uint32_t event;           /* event ID                */
81351 +    unsigned long data[5];    /* event data items        */
81352 +};
81353 +
81354 +/*
81355 + * This structure contains the metadata for a single trace buffer.  The head
81356 + * field, indexes into an array of struct t_rec's.
81357 + */
81358 +struct t_buf {
81359 +    uint32_t cons;      /* Next item to be consumed by control tools. */
81360 +    uint32_t prod;      /* Next item to be produced by Xen.           */
81361 +    /* 'nr_recs' records follow immediately after the meta-data header.    */
81362 +};
81363 +
81364 +#endif /* __XEN_PUBLIC_TRACE_H__ */
81365 +
81366 +/*
81367 + * Local variables:
81368 + * mode: C
81369 + * c-set-style: "BSD"
81370 + * c-basic-offset: 4
81371 + * tab-width: 4
81372 + * indent-tabs-mode: nil
81373 + * End:
81374 + */
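A sketch of how a control tool might drain one buffer, given that the t_rec entries follow the t_buf header; nr_recs must come from the tools' buffer-size calculation and is assumed here.

static void example_drain_trace(struct t_buf *buf, uint32_t nr_recs)
{
    struct t_rec *recs = (struct t_rec *)(buf + 1); /* records follow header */

    while (buf->cons != buf->prod) {
        struct t_rec *r = &recs[buf->cons % nr_recs];
        /* ... consume r->cycles, r->event and r->data[0..4] here ... */
        buf->cons++;
    }
}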
81375 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/vcpu.h tmp-linux-2.6-xen.patch/include/xen/interface/vcpu.h
81376 --- ref-linux-2.6.16.9/include/xen/interface/vcpu.h     1970-01-01 01:00:00.000000000 +0100
81377 +++ tmp-linux-2.6-xen.patch/include/xen/interface/vcpu.h        2006-04-10 00:05:57.000000000 +0200
81378 @@ -0,0 +1,119 @@
81379 +/******************************************************************************
81380 + * vcpu.h
81381 + * 
81382 + * VCPU initialisation, query, and hotplug.
81383 + * 
81384 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
81385 + */
81386 +
81387 +#ifndef __XEN_PUBLIC_VCPU_H__
81388 +#define __XEN_PUBLIC_VCPU_H__
81389 +
81390 +/*
81391 + * Prototype for this hypercall is:
81392 + *  int vcpu_op(int cmd, int vcpuid, void *extra_args)
81393 + * @cmd        == VCPUOP_??? (VCPU operation).
81394 + * @vcpuid     == VCPU to operate on.
81395 + * @extra_args == Operation-specific extra arguments (NULL if none).
81396 + */
81397 +
81398 +/*
81399 + * Initialise a VCPU. Each VCPU can be initialised only once. A 
81400 + * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
81401 + * 
81402 + * @extra_arg == pointer to vcpu_guest_context structure containing initial
81403 + *               state for the VCPU.
81404 + */
81405 +#define VCPUOP_initialise           0
81406 +
81407 +/*
81408 + * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
81409 + * if the VCPU has not been initialised (VCPUOP_initialise).
81410 + */
81411 +#define VCPUOP_up                   1
81412 +
81413 +/*
81414 + * Bring down a VCPU (i.e., make it non-runnable).
81415 + * There are a few caveats that callers should observe:
81416 + *  1. This operation may return, and VCPU_is_up may return false, before the
81417 + *     VCPU stops running (i.e., the command is asynchronous). It is a good
81418 + *     idea to ensure that the VCPU has entered a non-critical loop before
81419 + *     bringing it down. Alternatively, this operation is guaranteed
81420 + *     synchronous if invoked by the VCPU itself.
81421 + *  2. After a VCPU is initialised, there is currently no way to drop all its
81422 + *     references to domain memory. Even a VCPU that is down still holds
81423 + *     memory references via its pagetable base pointer and GDT. It is good
81424 + *     practice to move a VCPU onto an 'idle' or default page table, LDT and
81425 + *     GDT before bringing it down.
81426 + */
81427 +#define VCPUOP_down                 2
81428 +
81429 +/* Returns 1 if the given VCPU is up. */
81430 +#define VCPUOP_is_up                3
81431 +
81432 +/*
81433 + * Return information about the state and running time of a VCPU.
81434 + * @extra_arg == pointer to vcpu_runstate_info structure.
81435 + */
81436 +#define VCPUOP_get_runstate_info    4
81437 +typedef struct vcpu_runstate_info {
81438 +    /* VCPU's current state (RUNSTATE_*). */
81439 +    int      state;
81440 +    /* When was current state entered (system time, ns)? */
81441 +    uint64_t state_entry_time;
81442 +    /*
81443 +     * Time spent in each RUNSTATE_* (ns). The sum of these times is
81444 +     * guaranteed not to drift from system time.
81445 +     */
81446 +    uint64_t time[4];
81447 +} vcpu_runstate_info_t;
81448 +
81449 +/* VCPU is currently running on a physical CPU. */
81450 +#define RUNSTATE_running  0
81451 +
81452 +/* VCPU is runnable, but not currently scheduled on any physical CPU. */
81453 +#define RUNSTATE_runnable 1
81454 +
81455 +/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
81456 +#define RUNSTATE_blocked  2
81457 +
81458 +/*
81459 + * VCPU is not runnable, but it is not blocked.
81460 + * This is a 'catch all' state for things like hotplug and pauses by the
81461 + * system administrator (or for critical sections in the hypervisor).
81462 + * RUNSTATE_blocked dominates this state (it is the preferred state).
81463 + */
81464 +#define RUNSTATE_offline  3
81465 +
81466 +/*
81467 + * Register a shared memory area from which the guest may obtain its own
81468 + * runstate information without needing to execute a hypercall.
81469 + * Notes:
81470 + *  1. The registered address may be virtual or physical, depending on the
81471 + *     platform. The virtual address should be registered on x86 systems.
81472 + *  2. Only one shared area may be registered per VCPU. The shared area is
81473 + *     updated by the hypervisor each time the VCPU is scheduled. Thus
81474 + *     runstate.state will always be RUNSTATE_running and
81475 + *     runstate.state_entry_time will indicate the system time at which the
81476 + *     VCPU was last scheduled to run.
81477 + * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
81478 + */
81479 +#define VCPUOP_register_runstate_memory_area 5
81480 +typedef struct vcpu_register_runstate_memory_area {
81481 +    union {
81482 +        struct vcpu_runstate_info *v;
81483 +        uint64_t p;
81484 +    } addr;
81485 +} vcpu_register_runstate_memory_area_t;
81486 +
81487 +#endif /* __XEN_PUBLIC_VCPU_H__ */
81488 +
81489 +/*
81490 + * Local variables:
81491 + * mode: C
81492 + * c-set-style: "BSD"
81493 + * c-basic-offset: 4
81494 + * tab-width: 4
81495 + * indent-tabs-mode: nil
81496 + * End:
81497 + */
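A sketch of querying the runstate counters documented above; HYPERVISOR_vcpu_op() is the assumed wrapper for the vcpu_op hypercall.

static void example_show_runstate(int vcpu)
{
    vcpu_runstate_info_t info;

    if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, vcpu, &info) == 0)
        printk("vcpu%d: state %d, %llu ns spent running\n",
               vcpu, info.state,
               (unsigned long long)info.time[RUNSTATE_running]);
}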
81498 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/version.h tmp-linux-2.6-xen.patch/include/xen/interface/version.h
81499 --- ref-linux-2.6.16.9/include/xen/interface/version.h  1970-01-01 01:00:00.000000000 +0100
81500 +++ tmp-linux-2.6-xen.patch/include/xen/interface/version.h     2006-04-10 00:05:57.000000000 +0200
81501 @@ -0,0 +1,64 @@
81502 +/******************************************************************************
81503 + * version.h
81504 + * 
81505 + * Xen version, type, and compile information.
81506 + * 
81507 + * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
81508 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
81509 + */
81510 +
81511 +#ifndef __XEN_PUBLIC_VERSION_H__
81512 +#define __XEN_PUBLIC_VERSION_H__
81513 +
81514 +/* NB. All ops return zero on success, except XENVER_version. */
81515 +
81516 +/* arg == NULL; returns major:minor (16:16). */
81517 +#define XENVER_version      0
81518 +
81519 +/* arg == xen_extraversion_t. */
81520 +#define XENVER_extraversion 1
81521 +typedef char xen_extraversion_t[16];
81522 +#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
81523 +
81524 +/* arg == xen_compile_info_t. */
81525 +#define XENVER_compile_info 2
81526 +typedef struct xen_compile_info {
81527 +    char compiler[64];
81528 +    char compile_by[16];
81529 +    char compile_domain[32];
81530 +    char compile_date[32];
81531 +} xen_compile_info_t;
81532 +
81533 +#define XENVER_capabilities 3
81534 +typedef char xen_capabilities_info_t[1024];
81535 +#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
81536 +
81537 +#define XENVER_changeset 4
81538 +typedef char xen_changeset_info_t[64];
81539 +#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
81540 +
81541 +#define XENVER_platform_parameters 5
81542 +typedef struct xen_platform_parameters {
81543 +    unsigned long virt_start;
81544 +} xen_platform_parameters_t;
81545 +
81546 +#define XENVER_get_features 6
81547 +typedef struct xen_feature_info {
81548 +    unsigned int submap_idx;    /* IN: which 32-bit submap to return */
81549 +    uint32_t     submap;        /* OUT: 32-bit submap */
81550 +} xen_feature_info_t;
81551 +
81552 +/* Declares the features reported by XENVER_get_features. */
81553 +#include "features.h"
81554 +
81555 +#endif /* __XEN_PUBLIC_VERSION_H__ */
81556 +
81557 +/*
81558 + * Local variables:
81559 + * mode: C
81560 + * c-set-style: "BSD"
81561 + * c-basic-offset: 4
81562 + * tab-width: 4
81563 + * indent-tabs-mode: nil
81564 + * End:
81565 + */
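A sketch of decoding the packed major:minor (16:16) value that XENVER_version returns; HYPERVISOR_xen_version() is the assumed hypercall wrapper.

static void example_print_xen_version(void)
{
    int v = HYPERVISOR_xen_version(XENVER_version, NULL);
    xen_extraversion_t extra;

    HYPERVISOR_xen_version(XENVER_extraversion, extra);
    printk("Xen %d.%d%s\n", v >> 16, v & 0xffff, extra);
}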
81566 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/xen-compat.h tmp-linux-2.6-xen.patch/include/xen/interface/xen-compat.h
81567 --- ref-linux-2.6.16.9/include/xen/interface/xen-compat.h       1970-01-01 01:00:00.000000000 +0100
81568 +++ tmp-linux-2.6-xen.patch/include/xen/interface/xen-compat.h  2006-04-10 00:05:57.000000000 +0200
81569 @@ -0,0 +1,31 @@
81570 +/******************************************************************************
81571 + * xen-compat.h
81572 + * 
81573 + * Guest OS interface to Xen.  Compatibility layer.
81574 + * 
81575 + * Copyright (c) 2006, Christian Limpach
81576 + */
81577 +
81578 +#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
81579 +#define __XEN_PUBLIC_XEN_COMPAT_H__
81580 +
81581 +#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030101
81582 +
81583 +#if defined(__XEN__)
81584 +/* Xen is built with matching headers and implements the latest interface. */
81585 +#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
81586 +#elif !defined(__XEN_INTERFACE_VERSION__)
81587 +/* Guests which do not specify a version get the legacy interface. */
81588 +#define __XEN_INTERFACE_VERSION__ 0x00000000
81589 +#endif
81590 +
81591 +#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
81592 +#error "These header files do not support the requested interface version."
81593 +#endif
81594 +
81595 +#if __XEN_INTERFACE_VERSION__ < 0x00030101
81596 +#undef __HYPERVISOR_sched_op
81597 +#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
81598 +#endif
81599 +
81600 +#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
81601 diff -Nurp ref-linux-2.6.16.9/include/xen/interface/xen.h tmp-linux-2.6-xen.patch/include/xen/interface/xen.h
81602 --- ref-linux-2.6.16.9/include/xen/interface/xen.h      1970-01-01 01:00:00.000000000 +0100
81603 +++ tmp-linux-2.6-xen.patch/include/xen/interface/xen.h 2006-04-10 00:05:57.000000000 +0200
81604 @@ -0,0 +1,451 @@
81605 +/******************************************************************************
81606 + * xen.h
81607 + * 
81608 + * Guest OS interface to Xen.
81609 + * 
81610 + * Copyright (c) 2004, K A Fraser
81611 + */
81612 +
81613 +#ifndef __XEN_PUBLIC_XEN_H__
81614 +#define __XEN_PUBLIC_XEN_H__
81615 +
81616 +#if defined(__i386__)
81617 +#include "arch-x86_32.h"
81618 +#elif defined(__x86_64__)
81619 +#include "arch-x86_64.h"
81620 +#elif defined(__ia64__)
81621 +#include "arch-ia64.h"
81622 +#else
81623 +#error "Unsupported architecture"
81624 +#endif
81625 +
81626 +/*
81627 + * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
81628 + */
81629 +
81630 +/*
81631 + * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
81632 + *         EAX = return value
81633 + *         (argument registers may be clobbered on return)
81634 + * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. 
81635 + *         RAX = return value
81636 + *         (argument registers not clobbered on return; RCX, R11 are)
81637 + */
81638 +#define __HYPERVISOR_set_trap_table        0
81639 +#define __HYPERVISOR_mmu_update            1
81640 +#define __HYPERVISOR_set_gdt               2
81641 +#define __HYPERVISOR_stack_switch          3
81642 +#define __HYPERVISOR_set_callbacks         4
81643 +#define __HYPERVISOR_fpu_taskswitch        5
81644 +#define __HYPERVISOR_sched_op_compat       6 /* compat as of 0x00030101 */
81645 +#define __HYPERVISOR_dom0_op               7
81646 +#define __HYPERVISOR_set_debugreg          8
81647 +#define __HYPERVISOR_get_debugreg          9
81648 +#define __HYPERVISOR_update_descriptor    10
81649 +#define __HYPERVISOR_memory_op            12
81650 +#define __HYPERVISOR_multicall            13
81651 +#define __HYPERVISOR_update_va_mapping    14
81652 +#define __HYPERVISOR_set_timer_op         15
81653 +#define __HYPERVISOR_event_channel_op     16
81654 +#define __HYPERVISOR_xen_version          17
81655 +#define __HYPERVISOR_console_io           18
81656 +#define __HYPERVISOR_physdev_op           19
81657 +#define __HYPERVISOR_grant_table_op       20
81658 +#define __HYPERVISOR_vm_assist            21
81659 +#define __HYPERVISOR_update_va_mapping_otherdomain 22
81660 +#define __HYPERVISOR_iret                 23 /* x86 only */
81661 +#define __HYPERVISOR_vcpu_op              24
81662 +#define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
81663 +#define __HYPERVISOR_mmuext_op            26
81664 +#define __HYPERVISOR_acm_op               27
81665 +#define __HYPERVISOR_nmi_op               28
81666 +#define __HYPERVISOR_sched_op             29
81667 +
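The register convention documented above is easiest to see in macro form. Below is a minimal sketch, closely modelled on the _hypercallN macros this patch adds for x86_32: each vector selects a 32-byte slot in a hypervisor-populated hypercall_page, EAX carries the vector in and the return value out, and EBX/ECX carry the first two arguments. The symbol name and clobber details here are assumptions, not the patch's exact text.

#define __STR(x) #x
#define STR(x)   __STR(x)

#define hypercall2(name, a1, a2)                                \
({                                                              \
        long __res, __ign1, __ign2;                             \
        asm volatile (                                          \
                "call hypercall_page + ("                       \
                STR(__HYPERVISOR_##name) " * 32)"               \
                : "=a" (__res), "=b" (__ign1), "=c" (__ign2)    \
                : "1" ((long)(a1)), "2" ((long)(a2))            \
                : "memory");                                    \
        __res;                                                  \
})

For example, hypercall2(stack_switch, ss, esp) would issue __HYPERVISOR_stack_switch with two arguments.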
81668 +/* 
81669 + * VIRTUAL INTERRUPTS
81670 + * 
81671 + * Virtual interrupts that a guest OS may receive from Xen.
81672 + */
81673 +#define VIRQ_TIMER      0  /* Timebase update, and/or requested timeout.  */
81674 +#define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
81675 +#define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
81676 +#define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
81677 +#define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
81678 +#define NR_VIRQS        8
81679 +
81680 +/*
81681 + * MMU-UPDATE REQUESTS
81682 + * 
81683 + * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
81684 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
81685 + * Where the FD has some effect, it is described below.
81686 + * ptr[1:0] specifies the appropriate MMU_* command.
81687 + * 
81688 + * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
81689 + * Updates an entry in a page table. If updating an L1 table, and the new
81690 + * table entry is valid/present, the mapped frame must belong to the FD, if
81691 + * an FD has been specified. If attempting to map an I/O page then the
81692 + * caller assumes the privilege of the FD.
81693 + * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
81694 + * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
81695 + * ptr[:2]  -- Machine address of the page-table entry to modify.
81696 + * val      -- Value to write.
81697 + * 
81698 + * ptr[1:0] == MMU_MACHPHYS_UPDATE:
81699 + * Updates an entry in the machine->pseudo-physical mapping table.
81700 + * ptr[:2]  -- Machine address within the frame whose mapping to modify.
81701 + *             The frame must belong to the FD, if one is specified.
81702 + * val      -- Value to write into the mapping entry.
81703 + */
81704 +#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
81705 +#define MMU_MACHPHYS_UPDATE      1 /* ptr = MA of frame to modify entry for  */
81706 +
81707 +/*
81708 + * MMU EXTENDED OPERATIONS
81709 + * 
81710 + * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
81711 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
81712 + * Where the FD has some effect, it is described below.
81713 + * 
81714 + * cmd: MMUEXT_(UN)PIN_*_TABLE
81715 + * mfn: Machine frame number to be (un)pinned as a p.t. page.
81716 + *      The frame must belong to the FD, if one is specified.
81717 + * 
81718 + * cmd: MMUEXT_NEW_BASEPTR
81719 + * mfn: Machine frame number of new page-table base to install in MMU.
81720 + * 
81721 + * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
81722 + * mfn: Machine frame number of new page-table base to install in MMU
81723 + *      when in user space.
81724 + * 
81725 + * cmd: MMUEXT_TLB_FLUSH_LOCAL
81726 + * No additional arguments. Flushes local TLB.
81727 + * 
81728 + * cmd: MMUEXT_INVLPG_LOCAL
81729 + * linear_addr: Linear address to be flushed from the local TLB.
81730 + * 
81731 + * cmd: MMUEXT_TLB_FLUSH_MULTI
81732 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
81733 + * 
81734 + * cmd: MMUEXT_INVLPG_MULTI
81735 + * linear_addr: Linear address to be flushed.
81736 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
81737 + * 
81738 + * cmd: MMUEXT_TLB_FLUSH_ALL
81739 + * No additional arguments. Flushes all VCPUs' TLBs.
81740 + * 
81741 + * cmd: MMUEXT_INVLPG_ALL
81742 + * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
81743 + * 
81744 + * cmd: MMUEXT_FLUSH_CACHE
81745 + * No additional arguments. Writes back and flushes cache contents.
81746 + * 
81747 + * cmd: MMUEXT_SET_LDT
81748 + * linear_addr: Linear address of LDT base (NB. must be page-aligned).
81749 + * nr_ents: Number of entries in LDT.
81750 + */
81751 +#define MMUEXT_PIN_L1_TABLE      0
81752 +#define MMUEXT_PIN_L2_TABLE      1
81753 +#define MMUEXT_PIN_L3_TABLE      2
81754 +#define MMUEXT_PIN_L4_TABLE      3
81755 +#define MMUEXT_UNPIN_TABLE       4
81756 +#define MMUEXT_NEW_BASEPTR       5
81757 +#define MMUEXT_TLB_FLUSH_LOCAL   6
81758 +#define MMUEXT_INVLPG_LOCAL      7
81759 +#define MMUEXT_TLB_FLUSH_MULTI   8
81760 +#define MMUEXT_INVLPG_MULTI      9
81761 +#define MMUEXT_TLB_FLUSH_ALL    10
81762 +#define MMUEXT_INVLPG_ALL       11
81763 +#define MMUEXT_FLUSH_CACHE      12
81764 +#define MMUEXT_SET_LDT          13
81765 +#define MMUEXT_NEW_USER_BASEPTR 15
81766 +
81767 +#ifndef __ASSEMBLY__
81768 +typedef struct mmuext_op {
81769 +    unsigned int cmd;
81770 +    union {
81771 +        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
81772 +        unsigned long mfn;
81773 +        /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
81774 +        unsigned long linear_addr;
81775 +    } arg1;
81776 +    union {
81777 +        /* SET_LDT */
81778 +        unsigned int nr_ents;
81779 +        /* TLB_FLUSH_MULTI, INVLPG_MULTI */
81780 +        void *vcpumask;
81781 +    } arg2;
81782 +} mmuext_op_t;
81783 +DEFINE_GUEST_HANDLE(mmuext_op_t);
81784 +#endif
81785 +
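A minimal sketch of a two-element batch built from the command list above: pin a new page directory as an L2 table, then flush the local TLB. HYPERVISOR_mmuext_op is assumed to be the usual guest wrapper for hypercall 26, taking (ops, count, success-count pointer, foreigndom).

static void pin_new_pgd(unsigned long pgd_mfn)
{
        struct mmuext_op op[2];
        unsigned int done;

        op[0].cmd      = MMUEXT_PIN_L2_TABLE;
        op[0].arg1.mfn = pgd_mfn;                 /* frame holding the pgd */
        op[1].cmd      = MMUEXT_TLB_FLUSH_LOCAL;  /* no arguments */
        if (HYPERVISOR_mmuext_op(op, 2, &done, DOMID_SELF) < 0)
                BUG();
}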
81786 +/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
81787 +/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap.   */
81788 +/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer.         */
81789 +#define UVMF_NONE               (0UL<<0) /* No flushing at all.   */
81790 +#define UVMF_TLB_FLUSH          (1UL<<0) /* Flush entire TLB(s).  */
81791 +#define UVMF_INVLPG             (2UL<<0) /* Flush only one entry. */
81792 +#define UVMF_FLUSHTYPE_MASK     (3UL<<0)
81793 +#define UVMF_MULTI              (0UL<<2) /* Flush subset of TLBs. */
81794 +#define UVMF_LOCAL              (0UL<<2) /* Flush local TLB.      */
81795 +#define UVMF_ALL                (1UL<<2) /* Flush all TLBs.       */
81796 +
81797 +/*
81798 + * Commands to HYPERVISOR_console_io().
81799 + */
81800 +#define CONSOLEIO_write         0
81801 +#define CONSOLEIO_read          1
81802 +
81803 +/*
81804 + * Commands to HYPERVISOR_vm_assist().
81805 + */
81806 +#define VMASST_CMD_enable                0
81807 +#define VMASST_CMD_disable               1
81808 +#define VMASST_TYPE_4gb_segments         0
81809 +#define VMASST_TYPE_4gb_segments_notify  1
81810 +#define VMASST_TYPE_writable_pagetables  2
81811 +#define MAX_VMASST_TYPE 2
81812 +
81813 +#ifndef __ASSEMBLY__
81814 +
81815 +typedef uint16_t domid_t;
81816 +
81817 +/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
81818 +#define DOMID_FIRST_RESERVED (0x7FF0U)
81819 +
81820 +/* DOMID_SELF is used in certain contexts to refer to oneself. */
81821 +#define DOMID_SELF (0x7FF0U)
81822 +
81823 +/*
81824 + * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
81825 + * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
81826 + * is useful to ensure that no mappings to the OS's own heap are accidentally
81827 + * installed. (e.g., in Linux this could cause havoc as reference counts
81828 + * aren't adjusted on the I/O-mapping code path).
81829 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
81830 + * be specified by any calling domain.
81831 + */
81832 +#define DOMID_IO   (0x7FF1U)
81833 +
81834 +/*
81835 + * DOMID_XEN is used to allow privileged domains to map restricted parts of
81836 + * Xen's heap space (e.g., the machine_to_phys table).
81837 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
81838 + * the caller is privileged.
81839 + */
81840 +#define DOMID_XEN  (0x7FF2U)
81841 +
81842 +/*
81843 + * Send an array of these to HYPERVISOR_mmu_update().
81844 + * NB. The fields are natural pointer/address size for this architecture.
81845 + */
81846 +typedef struct mmu_update {
81847 +    uint64_t ptr;       /* Machine address of PTE. */
81848 +    uint64_t val;       /* New contents of PTE.    */
81849 +} mmu_update_t;
81850 +DEFINE_GUEST_HANDLE(mmu_update_t);
81851 +
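A sketch of a single checked PTE write, per the MMU-UPDATE REQUESTS section above: the command is encoded in the low two bits of ptr, and HYPERVISOR_mmu_update is assumed to be the usual (reqs, count, done-count, foreigndom) wrapper for hypercall 1.

static void write_pte_ma(uint64_t pte_machine_addr, uint64_t new_val)
{
        mmu_update_t u;
        unsigned int done;

        u.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;  /* cmd in ptr[1:0] */
        u.val = new_val;
        if (HYPERVISOR_mmu_update(&u, 1, &done, DOMID_SELF) < 0)
                BUG();
}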
81852 +/*
81853 + * Send an array of these to HYPERVISOR_multicall().
81854 + * NB. The fields are natural register size for this architecture.
81855 + */
81856 +typedef struct multicall_entry {
81857 +    unsigned long op, result;
81858 +    unsigned long args[6];
81859 +} multicall_entry_t;
81860 +DEFINE_GUEST_HANDLE(multicall_entry_t);
81861 +
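A sketch of batching two cheap hypercalls into one guest/hypervisor crossing; per-entry status comes back in each entry's result field. HYPERVISOR_multicall is assumed to be the usual (entries, count) wrapper for hypercall 13.

static void batched_switch(unsigned long ss, unsigned long esp)
{
        multicall_entry_t mc[2];

        mc[0].op      = __HYPERVISOR_fpu_taskswitch;
        mc[0].args[0] = 1;                        /* set CR0.TS */
        mc[1].op      = __HYPERVISOR_stack_switch;
        mc[1].args[0] = ss;
        mc[1].args[1] = esp;
        if (HYPERVISOR_multicall(mc, 2) != 0)     /* see mc[i].result per entry */
                BUG();
}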
81862 +/*
81863 + * Event channel endpoints per domain:
81864 + *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
81865 + */
81866 +#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
81867 +
81868 +typedef struct vcpu_time_info {
81869 +    /*
81870 +     * Updates to the following values are preceded and followed by an
81871 +     * increment of 'version'. The guest can therefore detect updates by
81872 +     * looking for changes to 'version'. If the least-significant bit of
81873 +     * the version number is set then an update is in progress and the guest
81874 +     * must wait to read a consistent set of values.
81875 +     * The correct way to interact with the version number is similar to
81876 +     * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
81877 +     */
81878 +    uint32_t version;
81879 +    uint32_t pad0;
81880 +    uint64_t tsc_timestamp;   /* TSC at last update of time vals.  */
81881 +    uint64_t system_time;     /* Time, in nanosecs, since boot.    */
81882 +    /*
81883 +     * Current system time:
81884 +     *   system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
81885 +     * CPU frequency (Hz):
81886 +     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
81887 +     */
81888 +    uint32_t tsc_to_system_mul;
81889 +    int8_t   tsc_shift;
81890 +    int8_t   pad1[3];
81891 +} vcpu_time_info_t; /* 32 bytes */
81892 +
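A sketch of the version protocol described in the struct: retry while an update is in progress (odd version) or the version changed under us. rdtsc() is an assumed TSC-read helper; tsc_to_system_mul is a 32.32 fixed-point multiplier (see the frequency formula above), and real implementations use a 96-bit intermediate product where this sketch simply truncates.

static uint64_t read_system_time(const vcpu_time_info_t *t)
{
        uint32_t version, mul;
        uint64_t tsc, stamp, base, delta;
        int8_t shift;

        do {
                version = t->version;
                rmb();                    /* snapshot fields only after version */
                stamp = t->tsc_timestamp;
                base  = t->system_time;
                mul   = t->tsc_to_system_mul;
                shift = t->tsc_shift;
                tsc   = rdtsc();
                rmb();
        } while ((version & 1) || (t->version != version));

        delta = tsc - stamp;
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        /* 32.32 fixed-point scaling; a 64-bit product can overflow here */
        return base + ((delta * (uint64_t)mul) >> 32);
}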
81893 +typedef struct vcpu_info {
81894 +    /*
81895 +     * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
81896 +     * a pending notification for a particular VCPU. It is then cleared 
81897 +     * by the guest OS /before/ checking for pending work, thus avoiding
81898 +     * a set-and-check race. Note that the mask is only accessed by Xen
81899 +     * on the CPU that is currently hosting the VCPU. This means that the
81900 +     * pending and mask flags can be updated by the guest without special
81901 +     * synchronisation (i.e., no need for the x86 LOCK prefix).
81902 +     * This may seem suboptimal because if the pending flag is set by
81903 +     * a different CPU then an IPI may be scheduled even when the mask
81904 +     * is set. However, note:
81905 +     *  1. The task of 'interrupt holdoff' is covered by the per-event-
81906 +     *     channel mask bits. A 'noisy' event that is continually being
81907 +     *     triggered can be masked at source at this very precise
81908 +     *     granularity.
81909 +     *  2. The main purpose of the per-VCPU mask is therefore to restrict
81910 +     *     reentrant execution: whether for concurrency control, or to
81911 +     *     prevent unbounded stack usage. Whatever the purpose, we expect
81912 +     *     that the mask will be asserted only for short periods at a time,
81913 +     *     and so the likelihood of a 'spurious' IPI is suitably small.
81914 +     * The mask is read before making an event upcall to the guest: a
81915 +     * non-zero mask therefore guarantees that the VCPU will not receive
81916 +     * an upcall activation. The mask is cleared when the VCPU requests
81917 +     * to block: this avoids wakeup-waiting races.
81918 +     */
81919 +    uint8_t evtchn_upcall_pending;
81920 +    uint8_t evtchn_upcall_mask;
81921 +    unsigned long evtchn_pending_sel;
81922 +    arch_vcpu_info_t arch;
81923 +    vcpu_time_info_t time;
81924 +} vcpu_info_t; /* 64 bytes (x86) */
81925 +
81926 +/*
81927 + * Xen/kernel shared data -- pointer provided in start_info.
81928 + * NB. We expect that this struct is smaller than a page.
81929 + */
81930 +typedef struct shared_info {
81931 +    vcpu_info_t vcpu_info[MAX_VIRT_CPUS];
81932 +
81933 +    /*
81934 +     * A domain can create "event channels" on which it can send and receive
81935 +     * asynchronous event notifications. There are three classes of event that
81936 +     * are delivered by this mechanism:
81937 +     *  1. Bi-directional inter- and intra-domain connections. Domains must
81938 +     *     arrange out-of-band to set up a connection (usually by allocating
81939 +     *     an unbound 'listener' port and advertising that via a storage service
81940 +     *     such as xenstore).
81941 +     *  2. Physical interrupts. A domain with suitable hardware-access
81942 +     *     privileges can bind an event-channel port to a physical interrupt
81943 +     *     source.
81944 +     *  3. Virtual interrupts ('events'). A domain can bind an event-channel
81945 +     *     port to a virtual interrupt source, such as the virtual-timer
81946 +     *     device or the emergency console.
81947 +     * 
81948 +     * Event channels are addressed by a "port index". Each channel is
81949 +     * associated with two bits of information:
81950 +     *  1. PENDING -- notifies the domain that there is a pending notification
81951 +     *     to be processed. This bit is cleared by the guest.
81952 +     *  2. MASK -- if this bit is clear then a 0->1 transition of PENDING
81953 +     *     will cause an asynchronous upcall to be scheduled. This bit is only
81954 +     *     updated by the guest. It is read-only within Xen. If a channel
81955 +     *     becomes pending while the channel is masked then the 'edge' is lost
81956 +     *     (i.e., when the channel is unmasked, the guest must manually handle
81957 +     *     pending notifications as no upcall will be scheduled by Xen).
81958 +     * 
81959 +     * To expedite scanning of pending notifications, any 0->1 pending
81960 +     * transition on an unmasked channel causes a corresponding bit in a
81961 +     * per-vcpu selector word to be set. Each bit in the selector covers a
81962 +     * 'C long' in the PENDING bitfield array.
81963 +     */
81964 +    unsigned long evtchn_pending[sizeof(unsigned long) * 8];
81965 +    unsigned long evtchn_mask[sizeof(unsigned long) * 8];
81966 +
81967 +    /*
81968 +     * Wallclock time: updated only by control software. Guests should base
81969 +     * their gettimeofday() syscall on this wallclock-base value.
81970 +     */
81971 +    uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
81972 +    uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
81973 +    uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
81974 +
81975 +    arch_shared_info_t arch;
81976 +
81977 +} shared_info_t;
81978 +
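A sketch of the two-level scan those comments describe: the per-VCPU selector word is atomically snapshotted and cleared, then each selected 'C long' of evtchn_pending is scanned with the mask applied. A real upcall handler also honours evtchn_upcall_pending/evtchn_upcall_mask; handle_port() is hypothetical.

static void scan_pending_ports(shared_info_t *s, unsigned int cpu)
{
        vcpu_info_t *v = &s->vcpu_info[cpu];
        unsigned long sel = xchg(&v->evtchn_pending_sel, 0);

        while (sel != 0) {
                unsigned int word = __ffs(sel);
                unsigned long bits;

                sel &= ~(1UL << word);
                bits = s->evtchn_pending[word] & ~s->evtchn_mask[word];
                while (bits != 0) {
                        unsigned int bit = __ffs(bits);

                        bits &= ~(1UL << bit);
                        clear_bit(bit, &s->evtchn_pending[word]);
                        handle_port(word * BITS_PER_LONG + bit);
                }
        }
}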
81979 +/*
81980 + * Start-of-day memory layout for the initial domain (DOM0):
81981 + *  1. The domain is started within contiguous virtual-memory region.
81982 + *  2. The contiguous region begins and ends on an aligned 4MB boundary.
81983 + *  3. The region start corresponds to the load address of the OS image.
81984 + *     If the load address is not 4MB aligned then the address is rounded down.
81985 + *  4. This is the order of bootstrap elements in the initial virtual region:
81986 + *      a. relocated kernel image
81987 + *      b. initial ram disk              [mod_start, mod_len]
81988 + *      c. list of allocated page frames [mfn_list, nr_pages]
81989 + *      d. start_info_t structure        [register ESI (x86)]
81990 + *      e. bootstrap page tables         [pt_base, CR3 (x86)]
81991 + *      f. bootstrap stack               [register ESP (x86)]
81992 + *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
81993 + *  6. The initial ram disk may be omitted.
81994 + *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
81995 + *     layout for the domain. In particular, the bootstrap virtual-memory
81996 + *     region is a 1:1 mapping to the first section of the pseudo-physical map.
81997 + *  8. All bootstrap elements are mapped read-writable for the guest OS. The
81998 + *     only exception is the bootstrap page table, which is mapped read-only.
81999 + *  9. There is guaranteed to be at least 512kB padding after the final
82000 + *     bootstrap element. If necessary, the bootstrap virtual region is
82001 + *     extended by an extra 4MB to ensure this.
82002 + */
82003 +
82004 +#define MAX_GUEST_CMDLINE 1024
82005 +typedef struct start_info {
82006 +    /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
82007 +    char magic[32];             /* "xen-<version>-<platform>".            */
82008 +    unsigned long nr_pages;     /* Total pages allocated to this domain.  */
82009 +    unsigned long shared_info;  /* MACHINE address of shared info struct. */
82010 +    uint32_t flags;             /* SIF_xxx flags.                         */
82011 +    unsigned long store_mfn;    /* MACHINE page number of shared page.    */
82012 +    uint32_t store_evtchn;      /* Event channel for store communication. */
82013 +    unsigned long console_mfn;  /* MACHINE page number of console page.   */
82014 +    uint32_t console_evtchn;    /* Event channel for console messages.    */
82015 +    /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
82016 +    unsigned long pt_base;      /* VIRTUAL address of page directory.     */
82017 +    unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames.       */
82018 +    unsigned long mfn_list;     /* VIRTUAL address of page-frame list.    */
82019 +    unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
82020 +    unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
82021 +    int8_t cmd_line[MAX_GUEST_CMDLINE];
82022 +} start_info_t;
82023 +
82024 +/* These flags are passed in the 'flags' field of start_info_t. */
82025 +#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
82026 +#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
82027 +
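A fragment sketching early-boot use of this structure, assuming the entry code has saved the ESI-passed pointer (per the layout notes above) in a variable named xen_start_info, the conventional name in this patch's arch code:

if (memcmp(xen_start_info->magic, "xen-3.0", 7) != 0)
        panic("unexpected hypervisor magic: %.32s", xen_start_info->magic);
if (xen_start_info->flags & SIF_INITDOMAIN)
        printk(KERN_INFO "dom0: %lu pages, store event channel %u\n",
               xen_start_info->nr_pages, xen_start_info->store_evtchn);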
82028 +typedef uint64_t cpumap_t;
82029 +
82030 +typedef uint8_t xen_domain_handle_t[16];
82031 +
82032 +/* Turn a plain number into a C unsigned long constant. */
82033 +#define __mk_unsigned_long(x) x ## UL
82034 +#define mk_unsigned_long(x) __mk_unsigned_long(x)
82035 +
82036 +#else /* __ASSEMBLY__ */
82037 +
82038 +/* In assembly code we cannot use C numeric constant suffixes. */
82039 +#define mk_unsigned_long(x) x
82040 +
82041 +#endif /* !__ASSEMBLY__ */
82042 +
82043 +#include "xen-compat.h"
82044 +
82045 +#endif /* __XEN_PUBLIC_XEN_H__ */
82046 +
82047 +/*
82048 + * Local variables:
82049 + * mode: C
82050 + * c-set-style: "BSD"
82051 + * c-basic-offset: 4
82052 + * tab-width: 4
82053 + * indent-tabs-mode: nil
82054 + * End:
82055 + */
82056 diff -Nurp ref-linux-2.6.16.9/include/xen/net_driver_util.h tmp-linux-2.6-xen.patch/include/xen/net_driver_util.h
82057 --- ref-linux-2.6.16.9/include/xen/net_driver_util.h    1970-01-01 01:00:00.000000000 +0100
82058 +++ tmp-linux-2.6-xen.patch/include/xen/net_driver_util.h       2006-04-10 00:05:52.000000000 +0200
82059 @@ -0,0 +1,58 @@
82060 +/*****************************************************************************
82061 + *
82062 + * Utility functions for Xen network devices.
82063 + *
82064 + * Copyright (c) 2005 XenSource Ltd.
82065 + * 
82066 + * This program is free software; you can redistribute it and/or
82067 + * modify it under the terms of the GNU General Public License version 2
82068 + * as published by the Free Software Foundation; or, when distributed
82069 + * separately from the Linux kernel or incorporated into other
82070 + * software packages, subject to the following license:
82071 + * 
82072 + * Permission is hereby granted, free of charge, to any person obtaining a
82073 + * copy of this source file (the "Software"), to deal in the Software without
82074 + * restriction, including without limitation the rights to use, copy, modify,
82075 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82076 + * and to permit persons to whom the Software is furnished to do so, subject
82077 + * to the following conditions:
82078 + * 
82079 + * The above copyright notice and this permission notice shall be included in
82080 + * all copies or substantial portions of the Software.
82081 + * 
82082 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82083 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82084 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82085 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82086 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82087 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
82088 + * DEALINGS IN THE SOFTWARE.
82089 + */
82090 +
82091 +#ifndef _ASM_XEN_NET_DRIVER_UTIL_H
82092 +#define _ASM_XEN_NET_DRIVER_UTIL_H
82093 +
82094 +
82095 +#include <xen/xenbus.h>
82096 +
82097 +
82098 +/**
82099 + * Read the 'mac' node at the given device's node in the store, and parse that
82100 + * as colon-separated octets, placing the result in the given mac array.  mac
82101 + * must be a preallocated array of length ETH_ALEN (from linux/if_ether.h).
82102 + * Return 0 on success, or -errno on error.
82103 + */
82104 +int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]);
82105 +
82106 +
82107 +#endif /* _ASM_XEN_NET_DRIVER_UTIL_H */
82108 +
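A usage sketch from a hypothetical network front-end probe; netdev and the helper name are illustrative, while xenbus_dev_fatal() is declared in xen/xenbus.h below.

static int read_mac_into(struct xenbus_device *dev, struct net_device *netdev)
{
        u8 mac[ETH_ALEN];
        int err = xen_net_read_mac(dev, mac);

        if (err) {
                xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
                return err;
        }
        memcpy(netdev->dev_addr, mac, ETH_ALEN);
        return 0;
}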
82109 +/*
82110 + * Local variables:
82111 + *  c-file-style: "linux"
82112 + *  indent-tabs-mode: t
82113 + *  c-indent-level: 8
82114 + *  c-basic-offset: 8
82115 + *  tab-width: 8
82116 + * End:
82117 + */
82118 diff -Nurp ref-linux-2.6.16.9/include/xen/pcifront.h tmp-linux-2.6-xen.patch/include/xen/pcifront.h
82119 --- ref-linux-2.6.16.9/include/xen/pcifront.h   1970-01-01 01:00:00.000000000 +0100
82120 +++ tmp-linux-2.6-xen.patch/include/xen/pcifront.h      2006-04-10 00:05:52.000000000 +0200
82121 @@ -0,0 +1,39 @@
82122 +/*
82123 + * PCI Frontend - arch-dependent declarations
82124 + *
82125 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
82126 + */
82127 +#ifndef __XEN_ASM_PCIFRONT_H__
82128 +#define __XEN_ASM_PCIFRONT_H__
82129 +
82130 +#include <linux/config.h>
82131 +#include <linux/spinlock.h>
82132 +
82133 +#ifdef __KERNEL__
82134 +
82135 +struct pcifront_device;
82136 +
82137 +struct pcifront_sd {
82138 +       int domain;
82139 +       struct pcifront_device *pdev;
82140 +};
82141 +
82142 +struct pci_bus;
82143 +
82144 +#ifdef CONFIG_PCI_DOMAINS
82145 +static inline int pci_domain_nr(struct pci_bus *bus)
82146 +{
82147 +       struct pcifront_sd *sd = bus->sysdata;
82148 +       return sd->domain;
82149 +}
82150 +static inline int pci_proc_domain(struct pci_bus *bus)
82151 +{
82152 +       return pci_domain_nr(bus);
82153 +}
82154 +#endif /* CONFIG_PCI_DOMAINS */
82155 +
82156 +extern spinlock_t pci_bus_lock;
82157 +
82158 +#endif /* __KERNEL__ */
82159 +
82160 +#endif /* __XEN_ASM_PCIFRONT_H__ */
82161 diff -Nurp ref-linux-2.6.16.9/include/xen/public/evtchn.h tmp-linux-2.6-xen.patch/include/xen/public/evtchn.h
82162 --- ref-linux-2.6.16.9/include/xen/public/evtchn.h      1970-01-01 01:00:00.000000000 +0100
82163 +++ tmp-linux-2.6-xen.patch/include/xen/public/evtchn.h 2006-04-10 00:05:52.000000000 +0200
82164 @@ -0,0 +1,101 @@
82165 +/******************************************************************************
82166 + * evtchn.h
82167 + * 
82168 + * Interface to /dev/xen/evtchn.
82169 + * 
82170 + * Copyright (c) 2003-2005, K A Fraser
82171 + * 
82172 + * This program is free software; you can redistribute it and/or
82173 + * modify it under the terms of the GNU General Public License version 2
82174 + * as published by the Free Software Foundation; or, when distributed
82175 + * separately from the Linux kernel or incorporated into other
82176 + * software packages, subject to the following license:
82177 + * 
82178 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82179 + * of this source file (the "Software"), to deal in the Software without
82180 + * restriction, including without limitation the rights to use, copy, modify,
82181 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82182 + * and to permit persons to whom the Software is furnished to do so, subject to
82183 + * the following conditions:
82184 + * 
82185 + * The above copyright notice and this permission notice shall be included in
82186 + * all copies or substantial portions of the Software.
82187 + * 
82188 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82189 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82190 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82191 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82192 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82193 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82194 + * IN THE SOFTWARE.
82195 + */
82196 +
82197 +#ifndef __LINUX_PUBLIC_EVTCHN_H__
82198 +#define __LINUX_PUBLIC_EVTCHN_H__
82199 +
82200 +/* /dev/xen/evtchn resides at device number major=10, minor=201 */
82201 +#define EVTCHN_MINOR 201
82202 +
82203 +/*
82204 + * Bind a fresh port to VIRQ @virq.
82205 + * Return allocated port.
82206 + */
82207 +#define IOCTL_EVTCHN_BIND_VIRQ                         \
82208 +       _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
82209 +struct ioctl_evtchn_bind_virq {
82210 +       unsigned int virq;
82211 +};
82212 +
82213 +/*
82214 + * Bind a fresh port to remote <@remote_domain, @remote_port>.
82215 + * Return allocated port.
82216 + */
82217 +#define IOCTL_EVTCHN_BIND_INTERDOMAIN                  \
82218 +       _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
82219 +struct ioctl_evtchn_bind_interdomain {
82220 +       unsigned int remote_domain, remote_port;
82221 +};
82222 +
82223 +/*
82224 + * Allocate a fresh port for binding to @remote_domain.
82225 + * Return allocated port.
82226 + */
82227 +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                 \
82228 +       _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
82229 +struct ioctl_evtchn_bind_unbound_port {
82230 +       unsigned int remote_domain;
82231 +};
82232 +
82233 +/*
82234 + * Unbind previously allocated @port.
82235 + */
82236 +#define IOCTL_EVTCHN_UNBIND                            \
82237 +       _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
82238 +struct ioctl_evtchn_unbind {
82239 +       unsigned int port;
82240 +};
82241 +
82242 +/*
82243 + * Send an event notification to previously allocated @port.
82244 + */
82245 +#define IOCTL_EVTCHN_NOTIFY                            \
82246 +       _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
82247 +struct ioctl_evtchn_notify {
82248 +       unsigned int port;
82249 +};
82250 +
82251 +/* Clear and reinitialise the event buffer. Clear error condition. */
82252 +#define IOCTL_EVTCHN_RESET                             \
82253 +       _IOC(_IOC_NONE, 'E', 5, 0)
82254 +
82255 +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
82256 +
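A hedged user-space sketch, error handling elided: bind a fresh port to VIRQ_DEBUG, notify it, then unbind. The /dev/xen/evtchn path is the conventional node for the major/minor numbers noted above, and the header include path may vary by installation.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "evtchn.h"                     /* this header; path is an assumption */

int poke_virq_debug(void)
{
        int fd = open("/dev/xen/evtchn", O_RDWR);
        struct ioctl_evtchn_bind_virq bind = { .virq = 1 /* VIRQ_DEBUG */ };
        int port = ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind);  /* new port */
        struct ioctl_evtchn_notify notify = { .port = port };
        struct ioctl_evtchn_unbind unbind = { .port = port };

        ioctl(fd, IOCTL_EVTCHN_NOTIFY, &notify);
        ioctl(fd, IOCTL_EVTCHN_UNBIND, &unbind);
        return close(fd);
}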
82257 +/*
82258 + * Local variables:
82259 + *  c-file-style: "linux"
82260 + *  indent-tabs-mode: t
82261 + *  c-indent-level: 8
82262 + *  c-basic-offset: 8
82263 + *  tab-width: 8
82264 + * End:
82265 + */
82266 diff -Nurp ref-linux-2.6.16.9/include/xen/public/privcmd.h tmp-linux-2.6-xen.patch/include/xen/public/privcmd.h
82267 --- ref-linux-2.6.16.9/include/xen/public/privcmd.h     1970-01-01 01:00:00.000000000 +0100
82268 +++ tmp-linux-2.6-xen.patch/include/xen/public/privcmd.h        2006-04-10 00:05:52.000000000 +0200
82269 @@ -0,0 +1,94 @@
82270 +/******************************************************************************
82271 + * privcmd.h
82272 + * 
82273 + * Interface to /proc/xen/privcmd.
82274 + * 
82275 + * Copyright (c) 2003-2005, K A Fraser
82276 + * 
82277 + * This program is free software; you can redistribute it and/or
82278 + * modify it under the terms of the GNU General Public License version 2
82279 + * as published by the Free Software Foundation; or, when distributed
82280 + * separately from the Linux kernel or incorporated into other
82281 + * software packages, subject to the following license:
82282 + * 
82283 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82284 + * of this source file (the "Software"), to deal in the Software without
82285 + * restriction, including without limitation the rights to use, copy, modify,
82286 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82287 + * and to permit persons to whom the Software is furnished to do so, subject to
82288 + * the following conditions:
82289 + * 
82290 + * The above copyright notice and this permission notice shall be included in
82291 + * all copies or substantial portions of the Software.
82292 + * 
82293 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82294 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82295 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82296 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82297 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82298 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82299 + * IN THE SOFTWARE.
82300 + */
82301 +
82302 +#ifndef __LINUX_PUBLIC_PRIVCMD_H__
82303 +#define __LINUX_PUBLIC_PRIVCMD_H__
82304 +
82305 +#ifndef __user
82306 +#define __user
82307 +#endif
82308 +
82309 +typedef struct privcmd_hypercall
82310 +{
82311 +       unsigned long op;
82312 +       unsigned long arg[5];
82313 +} privcmd_hypercall_t;
82314 +
82315 +typedef struct privcmd_mmap_entry {
82316 +       unsigned long va;
82317 +       unsigned long mfn;
82318 +       unsigned long npages;
82319 +} privcmd_mmap_entry_t; 
82320 +
82321 +typedef struct privcmd_mmap {
82322 +       int num;
82323 +       domid_t dom; /* target domain */
82324 +       privcmd_mmap_entry_t __user *entry;
82325 +} privcmd_mmap_t; 
82326 +
82327 +typedef struct privcmd_mmapbatch {
82328 +       int num;     /* number of pages to populate */
82329 +       domid_t dom; /* target domain */
82330 +       unsigned long addr;  /* virtual address */
82331 +       unsigned long __user *arr; /* array of mfns - top nibble set on err */
82332 +} privcmd_mmapbatch_t; 
82333 +
82334 +typedef struct privcmd_blkmsg
82335 +{
82336 +       unsigned long op;
82337 +       void         *buf;
82338 +       int           buf_size;
82339 +} privcmd_blkmsg_t;
82340 +
82341 +/*
82342 + * @cmd: IOCTL_PRIVCMD_HYPERCALL
82343 + * @arg: &privcmd_hypercall_t
82344 + * Return: Value returned from execution of the specified hypercall.
82345 + */
82346 +#define IOCTL_PRIVCMD_HYPERCALL                                        \
82347 +       _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
82348 +#define IOCTL_PRIVCMD_MMAP                                     \
82349 +       _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
82350 +#define IOCTL_PRIVCMD_MMAPBATCH                                        \
82351 +       _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
82352 +
82353 +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
82354 +
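A hedged user-space sketch: query the hypervisor version by pushing hypercall 17 (__HYPERVISOR_xen_version from xen/interface/xen.h) through the privcmd device. Command 0 (XENVER_version) takes no argument buffer and returns (major << 16) | minor. Include path and error handling are trimmed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "privcmd.h"                    /* this header; path is an assumption */

int main(void)
{
        privcmd_hypercall_t call = {
                .op  = 17,              /* __HYPERVISOR_xen_version */
                .arg = { 0 /* XENVER_version */ },
        };
        int fd = open("/proc/xen/privcmd", O_RDWR);
        long ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);

        printf("Xen %ld.%ld\n", ver >> 16, ver & 0xffff);
        return close(fd);
}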
82355 +/*
82356 + * Local variables:
82357 + *  c-file-style: "linux"
82358 + *  indent-tabs-mode: t
82359 + *  c-indent-level: 8
82360 + *  c-basic-offset: 8
82361 + *  tab-width: 8
82362 + * End:
82363 + */
82364 diff -Nurp ref-linux-2.6.16.9/include/xen/tpmfe.h tmp-linux-2.6-xen.patch/include/xen/tpmfe.h
82365 --- ref-linux-2.6.16.9/include/xen/tpmfe.h      1970-01-01 01:00:00.000000000 +0100
82366 +++ tmp-linux-2.6-xen.patch/include/xen/tpmfe.h 2006-04-10 00:05:52.000000000 +0200
82367 @@ -0,0 +1,40 @@
82368 +#ifndef TPM_FE_H
82369 +#define TPM_FE_H
82370 +
82371 +struct tpm_private;
82372 +
82373 +struct tpmfe_device {
82374 +       /*
82375 +        * Let upper layer receive data from front-end
82376 +        */
82377 +       int (*receive)(const u8 *buffer, size_t count, const void *ptr);
82378 +       /*
82379 +        * Indicate the status of the front-end to the upper
82380 +        * layer.
82381 +        */
82382 +       void (*status)(unsigned int flags);
82383 +
82384 +       /*
82385 +        * This field indicates the maximum size the driver can
82386 +        * transfer in one chunk. It is filled out by the front-end
82387 +        * driver and should be propagated to the generic tpm driver
82388 +        * for allocation of buffers.
82389 +        */
82390 +       unsigned int max_tx_size;
82391 +       /*
82392 +        * The following is a private structure of the underlying
82393 +        * driver. It's expected as first parameter in the send function.
82394 +        */
82395 +       struct tpm_private *tpm_private;
82396 +};
82397 +
82398 +enum {
82399 +       TPMFE_STATUS_DISCONNECTED = 0x0,
82400 +       TPMFE_STATUS_CONNECTED = 0x1
82401 +};
82402 +
82403 +int tpm_fe_send(struct tpm_private * tp, const u8 * buf, size_t count, void *ptr);
82404 +int tpm_fe_register_receiver(struct tpmfe_device *);
82405 +void tpm_fe_unregister_receiver(void);
82406 +
82407 +#endif
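A sketch of how a generic TPM driver might attach to this front-end; the my_* names and callback bodies are illustrative. After registration, max_tx_size bounds each transfer and tpm_private is the required first argument to tpm_fe_send().

static int my_receive(const u8 *buffer, size_t count, const void *ptr)
{
        /* hand count bytes up to the TPM core */
        return 0;
}

static void my_status(unsigned int flags)
{
        /* flags: TPMFE_STATUS_CONNECTED or TPMFE_STATUS_DISCONNECTED */
}

static struct tpmfe_device my_tpmfe = {
        .receive = my_receive,
        .status  = my_status,
};

static int __init my_tpm_init(void)
{
        return tpm_fe_register_receiver(&my_tpmfe);
}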
82408 diff -Nurp ref-linux-2.6.16.9/include/xen/xenbus.h tmp-linux-2.6-xen.patch/include/xen/xenbus.h
82409 --- ref-linux-2.6.16.9/include/xen/xenbus.h     1970-01-01 01:00:00.000000000 +0100
82410 +++ tmp-linux-2.6-xen.patch/include/xen/xenbus.h        2006-04-10 00:05:52.000000000 +0200
82411 @@ -0,0 +1,306 @@
82412 +/******************************************************************************
82413 + * xenbus.h
82414 + *
82415 + * Talks to Xen Store to figure out what devices we have.
82416 + *
82417 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
82418 + * Copyright (C) 2005 XenSource Ltd.
82419 + * 
82420 + * This program is free software; you can redistribute it and/or
82421 + * modify it under the terms of the GNU General Public License version 2
82422 + * as published by the Free Software Foundation; or, when distributed
82423 + * separately from the Linux kernel or incorporated into other
82424 + * software packages, subject to the following license:
82425 + * 
82426 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82427 + * of this source file (the "Software"), to deal in the Software without
82428 + * restriction, including without limitation the rights to use, copy, modify,
82429 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82430 + * and to permit persons to whom the Software is furnished to do so, subject to
82431 + * the following conditions:
82432 + * 
82433 + * The above copyright notice and this permission notice shall be included in
82434 + * all copies or substantial portions of the Software.
82435 + * 
82436 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82437 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82438 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82439 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82440 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82441 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82442 + * IN THE SOFTWARE.
82443 + */
82444 +
82445 +#ifndef _XEN_XENBUS_H
82446 +#define _XEN_XENBUS_H
82447 +
82448 +#include <linux/device.h>
82449 +#include <linux/notifier.h>
82450 +#include <linux/mutex.h>
82451 +#include <xen/interface/xen.h>
82452 +#include <xen/interface/grant_table.h>
82453 +#include <xen/interface/io/xenbus.h>
82454 +#include <xen/interface/io/xs_wire.h>
82455 +
82456 +#define XBT_NULL 0
82457 +
82458 +/* Register callback to watch this node. */
82459 +struct xenbus_watch
82460 +{
82461 +       struct list_head list;
82462 +
82463 +       /* Path being watched. */
82464 +       const char *node;
82465 +
82466 +       /* Callback (executed in a process context with no locks held). */
82467 +       void (*callback)(struct xenbus_watch *,
82468 +                        const char **vec, unsigned int len);
82469 +
82470 +       /* See XBWF_ definitions below. */
82471 +       unsigned long flags;
82472 +};
82473 +
82474 +/*
82475 + * Execute callback in its own kthread. Useful if the callback is long
82476 + * running or heavily serialised, to avoid taking out the main xenwatch thread
82477 + * for a long period of time (or even unwittingly causing a deadlock).
82478 + */
82479 +#define XBWF_new_thread        1
82480 +
82481 +/* A xenbus device. */
82482 +struct xenbus_device {
82483 +       const char *devicetype;
82484 +       const char *nodename;
82485 +       const char *otherend;
82486 +       int otherend_id;
82487 +       struct xenbus_watch otherend_watch;
82488 +       struct device dev;
82489 +       XenbusState state;
82490 +       void *data;
82491 +};
82492 +
82493 +static inline struct xenbus_device *to_xenbus_device(struct device *dev)
82494 +{
82495 +       return container_of(dev, struct xenbus_device, dev);
82496 +}
82497 +
82498 +struct xenbus_device_id
82499 +{
82500 +       /* .../device/<device_type>/<identifier> */
82501 +       char devicetype[32];    /* General class of device. */
82502 +};
82503 +
82504 +/* A xenbus driver. */
82505 +struct xenbus_driver {
82506 +       char *name;
82507 +       struct module *owner;
82508 +       const struct xenbus_device_id *ids;
82509 +       int (*probe)(struct xenbus_device *dev,
82510 +                    const struct xenbus_device_id *id);
82511 +       void (*otherend_changed)(struct xenbus_device *dev,
82512 +                                XenbusState backend_state);
82513 +       int (*remove)(struct xenbus_device *dev);
82514 +       int (*suspend)(struct xenbus_device *dev);
82515 +       int (*resume)(struct xenbus_device *dev);
82516 +       int (*uevent)(struct xenbus_device *, char **, int, char *, int);
82517 +       struct device_driver driver;
82518 +       int (*read_otherend_details)(struct xenbus_device *dev);
82519 +};
82520 +
82521 +static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
82522 +{
82523 +       return container_of(drv, struct xenbus_driver, driver);
82524 +}
82525 +
82526 +int xenbus_register_frontend(struct xenbus_driver *drv);
82527 +int xenbus_register_backend(struct xenbus_driver *drv);
82528 +void xenbus_unregister_driver(struct xenbus_driver *drv);
82529 +
82530 +typedef u32 xenbus_transaction_t;
82531 +
82532 +char **xenbus_directory(xenbus_transaction_t t,
82533 +                       const char *dir, const char *node, unsigned int *num);
82534 +void *xenbus_read(xenbus_transaction_t t,
82535 +                 const char *dir, const char *node, unsigned int *len);
82536 +int xenbus_write(xenbus_transaction_t t,
82537 +                const char *dir, const char *node, const char *string);
82538 +int xenbus_mkdir(xenbus_transaction_t t,
82539 +                const char *dir, const char *node);
82540 +int xenbus_exists(xenbus_transaction_t t,
82541 +                 const char *dir, const char *node);
82542 +int xenbus_rm(xenbus_transaction_t t, const char *dir, const char *node);
82543 +int xenbus_transaction_start(xenbus_transaction_t *t);
82544 +int xenbus_transaction_end(xenbus_transaction_t t, int abort);
82545 +
82546 +/* Single read and scanf: returns -errno or num scanned if > 0. */
82547 +int xenbus_scanf(xenbus_transaction_t t,
82548 +                const char *dir, const char *node, const char *fmt, ...)
82549 +       __attribute__((format(scanf, 4, 5)));
82550 +
82551 +/* Single printf and write: returns -errno or 0. */
82552 +int xenbus_printf(xenbus_transaction_t t,
82553 +                 const char *dir, const char *node, const char *fmt, ...)
82554 +       __attribute__((format(printf, 4, 5)));
82555 +
82556 +/* Generic read function: NULL-terminated triples of name,
82557 + * sprintf-style type string, and pointer. Returns 0 or errno.*/
82558 +int xenbus_gather(xenbus_transaction_t t, const char *dir, ...);
82559 +
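A sketch of the canonical transaction pattern built from the calls above: retry from the top whenever the commit reports -EAGAIN. The ring-ref/event-channel keys are the usual front-end example, not something mandated by this header.

static int publish_ring(struct xenbus_device *dev,
                        unsigned int ring_ref, unsigned int evtchn)
{
        xenbus_transaction_t xbt;
        int err;

again:
        err = xenbus_transaction_start(&xbt);
        if (err)
                return err;
        err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", ring_ref);
        if (!err)
                err = xenbus_printf(xbt, dev->nodename,
                                    "event-channel", "%u", evtchn);
        if (err) {
                xenbus_transaction_end(xbt, 1);        /* abort */
                return err;
        }
        err = xenbus_transaction_end(xbt, 0);          /* commit */
        if (err == -EAGAIN)
                goto again;
        return err;
}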
82560 +/* notifier routines for when the xenstore comes up */
82561 +int register_xenstore_notifier(struct notifier_block *nb);
82562 +void unregister_xenstore_notifier(struct notifier_block *nb);
82563 +
82564 +int register_xenbus_watch(struct xenbus_watch *watch);
82565 +void unregister_xenbus_watch(struct xenbus_watch *watch);
82566 +void xs_suspend(void);
82567 +void xs_resume(void);
82568 +
82569 +/* Used by xenbus_dev to borrow kernel's store connection. */
82570 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
82571 +
82572 +/* Called from xen core code. */
82573 +void xenbus_suspend(void);
82574 +void xenbus_resume(void);
82575 +
82576 +#define XENBUS_IS_ERR_READ(str) ({                     \
82577 +       if (!IS_ERR(str) && strlen(str) == 0) {         \
82578 +               kfree(str);                             \
82579 +               str = ERR_PTR(-ERANGE);                 \
82580 +       }                                               \
82581 +       IS_ERR(str);                                    \
82582 +})
82583 +
82584 +#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
82585 +
82586 +
82587 +/**
82588 + * Register a watch on the given path, using the given xenbus_watch structure
82589 + * for storage, and the given callback function as the callback.  Return 0 on
82590 + * success, or -errno on error.  On success, the given path will be saved as
82591 + * watch->node, and remains the caller's to free.  On error, watch->node will
82592 + * be NULL, the device will switch to XenbusStateClosing, and the error will
82593 + * be saved in the store.
82594 + */
82595 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
82596 +                     struct xenbus_watch *watch,
82597 +                     void (*callback)(struct xenbus_watch *,
82598 +                                      const char **, unsigned int));
82599 +
82600 +
82601 +/**
82602 + * Register a watch on the given path/path2, using the given xenbus_watch
82603 + * structure for storage, and the given callback function as the callback.
82604 + * Return 0 on success, or -errno on error.  On success, the watched path
82605 + * (path/path2) will be saved as watch->node, and becomes the caller's to
82606 + * kfree().  On error, watch->node will be NULL, so the caller has nothing to
82607 + * free, the device will switch to XenbusStateClosing, and the error will be
82608 + * saved in the store.
82609 + */
82610 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
82611 +                      const char *path2, struct xenbus_watch *watch,
82612 +                      void (*callback)(struct xenbus_watch *,
82613 +                                       const char **, unsigned int));
82614 +
82615 +
82616 +/**
82617 + * Advertise in the store a change of the given driver to the given new_state.
82618 + * Return 0 on success, or -errno on error.  On error, the device will switch
82619 + * to XenbusStateClosing, and the error will be saved in the store.
82620 + */
82621 +int xenbus_switch_state(struct xenbus_device *dev, XenbusState new_state);
82622 +
82623 +
82624 +/**
82625 + * Grant access to the given ring_mfn to the peer of the given device.  Return
82626 + * 0 on success, or -errno on error.  On error, the device will switch to
82627 + * XenbusStateClosing, and the error will be saved in the store.
82628 + */
82629 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
82630 +
82631 +
82632 +/**
82633 + * Map a page of memory into this domain from another domain's grant table.
82634 + * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
82635 + * page to that address, and sets *vaddr to that address.
82636 + * xenbus_map_ring does not allocate the virtual address space (you must do
82637 + * this yourself!). It only maps in the page to the specified address.
82638 + * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
82639 + * or -ENOMEM on error. If an error is returned, device will switch to
82640 + * XenbusStateClosing and the error message will be saved in XenStore.
82641 + */
82642 +int xenbus_map_ring_valloc(struct xenbus_device *dev,
82643 +                          int gnt_ref, void **vaddr);
82644 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
82645 +                          grant_handle_t *handle, void *vaddr);
82646 +
82647 +
82648 +/**
82649 + * Unmap a page of memory in this domain that was imported from another domain.
82650 + * Use xenbus_unmap_ring_vfree if you mapped in your memory with
82651 + * xenbus_map_ring_valloc (it will free the virtual address space).
82652 + * Returns 0 on success and returns GNTST_* on error
82653 + * (see xen/include/interface/grant_table.h).
82654 + */
82655 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
82656 +int xenbus_unmap_ring(struct xenbus_device *dev,
82657 +                     grant_handle_t handle, void *vaddr);
82658 +
82659 +
82660 +/**
82661 + * Allocate an event channel for the given xenbus_device, assigning the newly
82662 + * created local port to *port.  Return 0 on success, or -errno on error.  On
82663 + * error, the device will switch to XenbusStateClosing, and the error will be
82664 + * saved in the store.
82665 + */
82666 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
82667 +
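Together with xenbus_grant_ring() above, this covers the usual front-end bring-up. A sketch under the documented contracts; note that some trees return the grant reference rather than 0 from xenbus_grant_ring() on success, so checking for negative values covers both conventions. virt_to_mfn() is the helper from this patch's hypervisor headers.

static int setup_comms(struct xenbus_device *dev, void *sring, int *evtchn)
{
        int err = xenbus_grant_ring(dev, virt_to_mfn(sring));

        if (err < 0)
                return err;
        return xenbus_alloc_evtchn(dev, evtchn);
}

The results would then be advertised with the transaction pattern sketched earlier, followed by xenbus_switch_state(dev, XenbusStateInitialised).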
82668 +
82669 +/**
82670 + * Bind to an existing interdomain event channel in another domain. Returns 0
82671 + * on success and stores the local port in *port. On error, returns -errno,
82672 + * switches the device to XenbusStateClosing, and saves the error in XenStore.
82673 + */
82674 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
82675 +
82676 +
82677 +/**
82678 + * Free an existing event channel. Returns 0 on success or -errno on error.
82679 + */
82680 +int xenbus_free_evtchn(struct xenbus_device *dev, int port);
82681 +
82682 +
82683 +/**
82684 + * Return the state of the driver rooted at the given store path, or
82685 + * XenbusStateClosed if no state can be read.
82686 + */
82687 +XenbusState xenbus_read_driver_state(const char *path);
82688 +
82689 +
82690 +/***
82691 + * Report the given negative errno into the store, along with the given
82692 + * formatted message.
82693 + */
82694 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
82695 +                     ...);
82696 +
82697 +
82698 +/***
82699 + * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
82700 + * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
82701 + * closedown of this driver and its peer.
82702 + */
82703 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
82704 +                     ...);
82705 +
82706 +
82707 +#endif /* _XEN_XENBUS_H */
82708 +
82709 +/*
82710 + * Local variables:
82711 + *  c-file-style: "linux"
82712 + *  indent-tabs-mode: t
82713 + *  c-indent-level: 8
82714 + *  c-basic-offset: 8
82715 + *  tab-width: 8
82716 + * End:
82717 + */
82718 diff -Nurp ref-linux-2.6.16.9/include/xen/xencons.h tmp-linux-2.6-xen.patch/include/xen/xencons.h
82719 --- ref-linux-2.6.16.9/include/xen/xencons.h    1970-01-01 01:00:00.000000000 +0100
82720 +++ tmp-linux-2.6-xen.patch/include/xen/xencons.h       2006-04-10 00:05:52.000000000 +0200
82721 @@ -0,0 +1,14 @@
82722 +#ifndef __ASM_XENCONS_H__
82723 +#define __ASM_XENCONS_H__
82724 +
82725 +void xencons_force_flush(void);
82726 +void xencons_resume(void);
82727 +
82728 +/* Interrupt work hooks. Receive data, or kick data out. */
82729 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
82730 +void xencons_tx(void);
82731 +
82732 +int xencons_ring_init(void);
82733 +int xencons_ring_send(const char *data, unsigned len);
82734 +
82735 +#endif /* __ASM_XENCONS_H__ */
82736 diff -Nurp ref-linux-2.6.16.9/include/xen/xen_proc.h tmp-linux-2.6-xen.patch/include/xen/xen_proc.h
82737 --- ref-linux-2.6.16.9/include/xen/xen_proc.h   1970-01-01 01:00:00.000000000 +0100
82738 +++ tmp-linux-2.6-xen.patch/include/xen/xen_proc.h      2006-04-10 00:05:52.000000000 +0200
82739 @@ -0,0 +1,23 @@
82740 +
82741 +#ifndef __ASM_XEN_PROC_H__
82742 +#define __ASM_XEN_PROC_H__
82743 +
82744 +#include <linux/config.h>
82745 +#include <linux/proc_fs.h>
82746 +
82747 +extern struct proc_dir_entry *create_xen_proc_entry(
82748 +       const char *name, mode_t mode);
82749 +extern void remove_xen_proc_entry(
82750 +       const char *name);
82751 +
82752 +#endif /* __ASM_XEN_PROC_H__ */
82753 +
82754 +/*
82755 + * Local variables:
82756 + *  c-file-style: "linux"
82757 + *  indent-tabs-mode: t
82758 + *  c-indent-level: 8
82759 + *  c-basic-offset: 8
82760 + *  tab-width: 8
82761 + * End:
82762 + */
82763 diff -Nurp ref-linux-2.6.16.9/kernel/Kconfig.preempt tmp-linux-2.6-xen.patch/kernel/Kconfig.preempt
82764 --- ref-linux-2.6.16.9/kernel/Kconfig.preempt   2006-04-19 08:10:14.000000000 +0200
82765 +++ tmp-linux-2.6-xen.patch/kernel/Kconfig.preempt      2006-04-10 00:05:52.000000000 +0200
82766 @@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
82767  
82768  config PREEMPT
82769         bool "Preemptible Kernel (Low-Latency Desktop)"
82770 +       depends on !XEN
82771         help
82772           This option reduces the latency of the kernel by making
82773           all kernel code (that is not executing in a critical section)
82774 diff -Nurp ref-linux-2.6.16.9/lib/Makefile tmp-linux-2.6-xen.patch/lib/Makefile
82775 --- ref-linux-2.6.16.9/lib/Makefile     2006-04-19 08:10:14.000000000 +0200
82776 +++ tmp-linux-2.6-xen.patch/lib/Makefile        2006-04-10 00:05:52.000000000 +0200
82777 @@ -45,6 +45,9 @@ obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
82778  obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
82779  
82780  obj-$(CONFIG_SWIOTLB) += swiotlb.o
82781 +ifneq ($(CONFIG_IA64),y)
82782 +swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
82783 +endif
82784  
82785  hostprogs-y    := gen_crc32table
82786  clean-files    := crc32table.h
82787 diff -Nurp ref-linux-2.6.16.9/mm/highmem.c tmp-linux-2.6-xen.patch/mm/highmem.c
82788 --- ref-linux-2.6.16.9/mm/highmem.c     2006-04-19 08:10:14.000000000 +0200
82789 +++ tmp-linux-2.6-xen.patch/mm/highmem.c        2006-04-10 00:05:52.000000000 +0200
82790 @@ -152,6 +152,17 @@ start:
82791         return vaddr;
82792  }
82793  
82794 +#ifdef CONFIG_XEN
82795 +void kmap_flush_unused(void)
82796 +{
82797 +       spin_lock(&kmap_lock);
82798 +       flush_all_zero_pkmaps();
82799 +       spin_unlock(&kmap_lock);
82800 +}
82801 +
82802 +EXPORT_SYMBOL(kmap_flush_unused);
82803 +#endif
82804 +
82805  void fastcall *kmap_high(struct page *page)
82806  {
82807         unsigned long vaddr;
82808 diff -Nurp ref-linux-2.6.16.9/mm/Kconfig tmp-linux-2.6-xen.patch/mm/Kconfig
82809 --- ref-linux-2.6.16.9/mm/Kconfig       2006-04-19 08:10:14.000000000 +0200
82810 +++ tmp-linux-2.6-xen.patch/mm/Kconfig  2006-04-10 00:05:52.000000000 +0200
82811 @@ -126,11 +126,14 @@ comment "Memory hotplug is currently inc
82812  # Default to 4 for wider testing, though 8 might be more appropriate.
82813  # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
82814  # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
82815 +# XEN uses the mapping field on pagetable pages to store a pointer to
82816 +# the destructor.
82817  #
82818  config SPLIT_PTLOCK_CPUS
82819         int
82820         default "4096" if ARM && !CPU_CACHE_VIPT
82821         default "4096" if PARISC && !PA20
82822 +       default "4096" if XEN
82823         default "4"
82824  
82825  #
82826 diff -Nurp ref-linux-2.6.16.9/mm/memory.c tmp-linux-2.6-xen.patch/mm/memory.c
82827 --- ref-linux-2.6.16.9/mm/memory.c      2006-04-19 08:10:14.000000000 +0200
82828 +++ tmp-linux-2.6-xen.patch/mm/memory.c 2006-04-10 00:05:52.000000000 +0200
82829 @@ -405,7 +405,8 @@ struct page *vm_normal_page(struct vm_ar
82830          * Remove this test eventually!
82831          */
82832         if (unlikely(!pfn_valid(pfn))) {
82833 -               print_bad_pte(vma, pte, addr);
82834 +               if (!(vma->vm_flags & VM_RESERVED))
82835 +                       print_bad_pte(vma, pte, addr);
82836                 return NULL;
82837         }
82838  
82839 @@ -1020,6 +1021,23 @@ int get_user_pages(struct task_struct *t
82840                         continue;
82841                 }
82842  
82843 +#ifdef CONFIG_XEN
82844 +               if (vma && (vma->vm_flags & VM_FOREIGN)) {
82845 +                       struct page **map = vma->vm_private_data;
82846 +                       int offset = (start - vma->vm_start) >> PAGE_SHIFT;
82847 +
82848 +                       if (map[offset] != NULL) {
82849 +                               if (pages)
82850 +                                       pages[i] = map[offset];
82851 +                               if (vmas)
82852 +                                       vmas[i] = vma;
82853 +                               i++;
82854 +                               start += PAGE_SIZE;
82855 +                               len--;
82856 +                               continue;
82857 +                       }
82858 +               }
82859 +#endif
82860                 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
82861                                 || !(vm_flags & vma->vm_flags))
82862                         return i ? : -EFAULT;
82863 @@ -1359,6 +1377,102 @@ int remap_pfn_range(struct vm_area_struc
82864  }
82865  EXPORT_SYMBOL(remap_pfn_range);
82866  
82867 +#ifdef CONFIG_XEN
82868 +static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
82869 +                                    unsigned long addr, unsigned long end,
82870 +                                    pte_fn_t fn, void *data)
82871 +{
82872 +       pte_t *pte;
82873 +       int err;
82874 +       struct page *pmd_page;
82875 +       spinlock_t *ptl;
82876 +
82877 +       pte = (mm == &init_mm) ?
82878 +               pte_alloc_kernel(pmd, addr) :
82879 +               pte_alloc_map_lock(mm, pmd, addr, &ptl);
82880 +       if (!pte)
82881 +               return -ENOMEM;
82882 +
82883 +       BUG_ON(pmd_huge(*pmd));
82884 +
82885 +       pmd_page = pmd_page(*pmd);
82886 +
82887 +       do {
82888 +               err = fn(pte, pmd_page, addr, data);
82889 +               if (err)
82890 +                       break;
82891 +       } while (pte++, addr += PAGE_SIZE, addr != end);
82892 +
82893 +       if (mm != &init_mm)
82894 +               pte_unmap_unlock(pte-1, ptl);
82895 +       return err;
82896 +}
82897 +
82898 +static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
82899 +                                    unsigned long addr, unsigned long end,
82900 +                                    pte_fn_t fn, void *data)
82901 +{
82902 +       pmd_t *pmd;
82903 +       unsigned long next;
82904 +       int err;
82905 +
82906 +       pmd = pmd_alloc(mm, pud, addr);
82907 +       if (!pmd)
82908 +               return -ENOMEM;
82909 +       do {
82910 +               next = pmd_addr_end(addr, end);
82911 +               err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
82912 +               if (err)
82913 +                       break;
82914 +       } while (pmd++, addr = next, addr != end);
82915 +       return err;
82916 +}
82917 +
82918 +static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
82919 +                                    unsigned long addr, unsigned long end,
82920 +                                    pte_fn_t fn, void *data)
82921 +{
82922 +       pud_t *pud;
82923 +       unsigned long next;
82924 +       int err;
82925 +
82926 +       pud = pud_alloc(mm, pgd, addr);
82927 +       if (!pud)
82928 +               return -ENOMEM;
82929 +       do {
82930 +               next = pud_addr_end(addr, end);
82931 +               err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
82932 +               if (err)
82933 +                       break;
82934 +       } while (pud++, addr = next, addr != end);
82935 +       return err;
82936 +}
82937 +
82938 +/*
82939 + * Scan a region of virtual memory, filling in page tables as necessary
82940 + * and calling a provided function on each leaf page table.
82941 + */
82942 +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
82943 +                       unsigned long size, pte_fn_t fn, void *data)
82944 +{
82945 +       pgd_t *pgd;
82946 +       unsigned long next;
82947 +       unsigned long end = addr + size;
82948 +       int err;
82949 +
82950 +       BUG_ON(addr >= end);
82951 +       pgd = pgd_offset(mm, addr);
82952 +       do {
82953 +               next = pgd_addr_end(addr, end);
82954 +               err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
82955 +               if (err)
82956 +                       break;
82957 +       } while (pgd++, addr = next, addr != end);
82958 +       return err;
82959 +}
82960 +EXPORT_SYMBOL_GPL(apply_to_page_range);
82961 +#endif
82962 +
82963  /*
82964   * handle_pte_fault chooses page fault handler according to an entry
82965   * which was read non-atomically.  Before making any commitment, on
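The apply_to_page_range() helper added above walks a virtual address range, allocating intermediate page-table levels on demand, and invokes the supplied callback once per PTE. A minimal usage sketch, assuming only the pte_fn_t signature implied by the fn(pte, pmd_page, addr, data) calls in the hunk; count_present_pte and count_present_ptes are illustrative names, not part of the patch:

    static int count_present_pte(pte_t *pte, struct page *pmd_page,
                                 unsigned long addr, void *data)
    {
            unsigned long *count = data;

            if (pte_present(*pte))
                    (*count)++;
            return 0;       /* a non-zero return aborts the walk */
    }

    static unsigned long count_present_ptes(unsigned long start,
                                            unsigned long len)
    {
            unsigned long count = 0;

            /* &init_mm selects the pte_alloc_kernel() branch above */
            apply_to_page_range(&init_mm, start, len,
                                count_present_pte, &count);
            return count;
    }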
82966 diff -Nurp ref-linux-2.6.16.9/mm/mmap.c tmp-linux-2.6-xen.patch/mm/mmap.c
82967 --- ref-linux-2.6.16.9/mm/mmap.c        2006-04-19 08:10:14.000000000 +0200
82968 +++ tmp-linux-2.6-xen.patch/mm/mmap.c   2006-04-10 00:05:52.000000000 +0200
82969 @@ -1937,6 +1937,10 @@ void exit_mmap(struct mm_struct *mm)
82970         unsigned long nr_accounted = 0;
82971         unsigned long end;
82972  
82973 +#ifdef arch_exit_mmap
82974 +       arch_exit_mmap(mm);
82975 +#endif
82976 +
82977         lru_add_drain();
82978         flush_cache_mm(mm);
82979         tlb = tlb_gather_mmu(mm, 1);
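The exit_mmap() change above is guarded by #ifdef arch_exit_mmap, so an architecture opts in by defining the hook as a macro visible to mm/mmap.c. A hedged sketch of such an opt-in; my_arch_exit_mmap is a hypothetical name and its body is illustrative only:

    static inline void my_arch_exit_mmap(struct mm_struct *mm)
    {
            /* e.g. drop hypervisor references to this mm before teardown */
    }
    #define arch_exit_mmap(mm) my_arch_exit_mmap(mm)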
82980 diff -Nurp ref-linux-2.6.16.9/mm/page_alloc.c tmp-linux-2.6-xen.patch/mm/page_alloc.c
82981 --- ref-linux-2.6.16.9/mm/page_alloc.c  2006-04-19 08:10:14.000000000 +0200
82982 +++ tmp-linux-2.6-xen.patch/mm/page_alloc.c     2006-04-10 00:05:52.000000000 +0200
82983 @@ -153,8 +153,7 @@ static void bad_page(struct page *page)
82984                         1 << PG_reclaim |
82985                         1 << PG_slab    |
82986                         1 << PG_swapcache |
82987 -                       1 << PG_writeback |
82988 -                       1 << PG_buddy );
82989 +                       1 << PG_writeback );
82990         set_page_count(page, 0);
82991         reset_page_mapcount(page);
82992         page->mapping = NULL;
82993 @@ -225,12 +224,12 @@ static inline unsigned long page_order(s
82994  
82995  static inline void set_page_order(struct page *page, int order) {
82996         set_page_private(page, order);
82997 -       __SetPageBuddy(page);
82998 +       __SetPagePrivate(page);
82999  }
83000  
83001  static inline void rmv_page_order(struct page *page)
83002  {
83003 -       __ClearPageBuddy(page);
83004 +       __ClearPagePrivate(page);
83005         set_page_private(page, 0);
83006  }
83007  
83008 @@ -269,13 +268,11 @@ __find_combined_index(unsigned long page
83009   * This function checks whether a page is free && is the buddy
83010   * we can coalesce a page and its buddy if
83011   * (a) the buddy is not in a hole &&
83012 - * (b) the buddy is in the buddy system &&
83013 - * (c) a page and its buddy have the same order.
83014 - *
83015 - * For recording whether a page is in the buddy system, we use PG_buddy.
83016 - * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
83017 + * (b) the buddy is free &&
83018 + * (c) the buddy is in the buddy system &&
83019 + * (d) a page and its buddy have the same order.
83020 + * For recording a page's order, we use page_private(page) and PG_private.
83021   *
83022 - * For recording page's order, we use page_private(page).
83023   */
83024  static inline int page_is_buddy(struct page *page, int order)
83025  {
83026 @@ -284,10 +281,10 @@ static inline int page_is_buddy(struct p
83027                 return 0;
83028  #endif
83029  
83030 -       if (PageBuddy(page) && page_order(page) == order) {
83031 -               BUG_ON(page_count(page) != 0);
83032 +       if (PagePrivate(page)           &&
83033 +           (page_order(page) == order) &&
83034 +            page_count(page) == 0)
83035                 return 1;
83036 -       }
83037         return 0;
83038  }
83039  
83040 @@ -304,7 +301,7 @@ static inline int page_is_buddy(struct p
83041   * as necessary, plus some accounting needed to play nicely with other
83042   * parts of the VM system.
83043   * At each level, we keep a list of pages, which are heads of continuous
83044 - * free pages of length of (1 << order) and marked with PG_buddy. Page's
83045 + * free pages of length of (1 << order) and marked with PG_private. Page's
83046   * order is recorded in page_private(page) field.
83047   * So when we are allocating or freeing one, we can derive the state of the
83048   * other.  That is, if we allocate a small block, and both were   
83049 @@ -367,8 +364,7 @@ static inline int free_pages_check(struc
83050                         1 << PG_slab    |
83051                         1 << PG_swapcache |
83052                         1 << PG_writeback |
83053 -                       1 << PG_reserved |
83054 -                       1 << PG_buddy ))))
83055 +                       1 << PG_reserved ))))
83056                 bad_page(page);
83057         if (PageDirty(page))
83058                 __ClearPageDirty(page);
83059 @@ -422,7 +418,8 @@ static void __free_pages_ok(struct page 
83060         int i;
83061         int reserved = 0;
83062  
83063 -       arch_free_page(page, order);
83064 +       if (arch_free_page(page, order))
83065 +               return;
83066         if (!PageHighMem(page))
83067                 mutex_debug_check_no_locks_freed(page_address(page),
83068                                                  PAGE_SIZE<<order);
83069 @@ -526,8 +523,7 @@ static int prep_new_page(struct page *pa
83070                         1 << PG_slab    |
83071                         1 << PG_swapcache |
83072                         1 << PG_writeback |
83073 -                       1 << PG_reserved |
83074 -                       1 << PG_buddy ))))
83075 +                       1 << PG_reserved ))))
83076                 bad_page(page);
83077  
83078         /*
83079 @@ -716,7 +712,8 @@ static void fastcall free_hot_cold_page(
83080         struct per_cpu_pages *pcp;
83081         unsigned long flags;
83082  
83083 -       arch_free_page(page, 0);
83084 +       if (arch_free_page(page, 0))
83085 +               return;
83086  
83087         if (PageAnon(page))
83088                 page->mapping = NULL;
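Both freeing paths above now honour a changed contract: arch_free_page() returns non-zero when the architecture has already disposed of the page, and the generic allocator must then leave it alone. A sketch of an implementation meeting that contract, assuming hypothetical my_page_is_foreign() and my_return_page() helpers:

    static inline int my_arch_free_page(struct page *page, int order)
    {
            if (!my_page_is_foreign(page))
                    return 0;               /* buddy freeing proceeds */
            my_return_page(page, order);    /* e.g. give back to hypervisor */
            return 1;                       /* callers stop here */
    }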
83089 diff -Nurp ref-linux-2.6.16.9/net/core/dev.c tmp-linux-2.6-xen.patch/net/core/dev.c
83090 --- ref-linux-2.6.16.9/net/core/dev.c   2006-04-19 08:10:14.000000000 +0200
83091 +++ tmp-linux-2.6-xen.patch/net/core/dev.c      2006-04-10 00:05:52.000000000 +0200
83092 @@ -116,6 +116,12 @@
83093  #endif /* CONFIG_NET_RADIO */
83094  #include <asm/current.h>
83095  
83096 +#ifdef CONFIG_XEN
83097 +#include <net/ip.h>
83098 +#include <linux/tcp.h>
83099 +#include <linux/udp.h>
83100 +#endif
83101 +
83102  /*
83103   *     The list of packet types we will receive (as opposed to discard)
83104   *     and the routines to invoke.
83105 @@ -1260,6 +1266,37 @@ int dev_queue_xmit(struct sk_buff *skb)
83106             __skb_linearize(skb, GFP_ATOMIC))
83107                 goto out_kfree_skb;
83108  
83109 +#ifdef CONFIG_XEN
83110 +       /* If a checksum-deferred packet is forwarded to a device that needs a
83111 +        * checksum, correct the pointers and force checksumming.
83112 +        */
83113 +       if (skb->proto_csum_blank) {
83114 +               if (skb->protocol != htons(ETH_P_IP))
83115 +                       goto out_kfree_skb;
83116 +               skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
83117 +               if (skb->h.raw >= skb->tail)
83118 +                       goto out_kfree_skb;
83119 +               switch (skb->nh.iph->protocol) {
83120 +               case IPPROTO_TCP:
83121 +                       skb->csum = offsetof(struct tcphdr, check);
83122 +                       break;
83123 +               case IPPROTO_UDP:
83124 +                       skb->csum = offsetof(struct udphdr, check);
83125 +                       break;
83126 +               default:
83127 +                       if (net_ratelimit())
83128 +                               printk(KERN_ERR "Attempting to checksum a non-"
83129 +                                      "TCP/UDP packet, dropping a protocol"
83130 +                                      " %d packet\n", skb->nh.iph->protocol);
83131 +                       rc = -EPROTO;
83132 +                       goto out_kfree_skb;
83133 +               }
83134 +               if ((skb->h.raw + skb->csum + 2) > skb->tail)
83135 +                       goto out_kfree_skb;
83136 +               skb->ip_summed = CHECKSUM_HW;
83137 +       }
83138 +#endif
83139 +
83140         /* If packet is not checksummed and device does not support
83141          * checksumming for this protocol, complete checksumming here.
83142          */
83143 @@ -1609,6 +1646,19 @@ int netif_receive_skb(struct sk_buff *sk
83144         }
83145  #endif
83146  
83147 +#ifdef CONFIG_XEN
83148 +       switch (skb->ip_summed) {
83149 +       case CHECKSUM_UNNECESSARY:
83150 +               skb->proto_data_valid = 1;
83151 +               break;
83152 +       case CHECKSUM_HW:
83153 +               /* XXX Implement me. */
83154 +       default:
83155 +               skb->proto_data_valid = 0;
83156 +               break;
83157 +       }
83158 +#endif
83159 +
83160         list_for_each_entry_rcu(ptype, &ptype_all, list) {
83161                 if (!ptype->dev || ptype->dev == skb->dev) {
83162                         if (pt_prev) 
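The receive-path hunk above sets proto_data_valid, and the transmit-path hunk earlier tests proto_csum_blank; both are one-bit sk_buff flags that the full patch adds elsewhere, outside this excerpt. An illustrative fragment of the assumed layout and semantics only, not the patch's actual skbuff.h change:

    /* illustrative only: the real bits are added to struct sk_buff */
    struct xen_skb_csum_flags {
            __u8    proto_data_valid:1;     /* rx: checksum already verified */
            __u8    proto_csum_blank:1;     /* tx: checksum field not yet filled */
    };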
83163 diff -Nurp ref-linux-2.6.16.9/net/core/skbuff.c tmp-linux-2.6-xen.patch/net/core/skbuff.c
83164 --- ref-linux-2.6.16.9/net/core/skbuff.c        2006-04-19 08:10:14.000000000 +0200
83165 +++ tmp-linux-2.6-xen.patch/net/core/skbuff.c   2006-04-10 00:05:52.000000000 +0200
83166 @@ -132,6 +132,7 @@ void skb_under_panic(struct sk_buff *skb
83167   *     Buffers may only be allocated from interrupts using a @gfp_mask of
83168   *     %GFP_ATOMIC.
83169   */
83170 +#ifndef CONFIG_HAVE_ARCH_ALLOC_SKB
83171  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
83172                             int fclone)
83173  {
83174 @@ -186,6 +187,7 @@ nodata:
83175         skb = NULL;
83176         goto out;
83177  }
83178 +#endif /* !CONFIG_HAVE_ARCH_ALLOC_SKB */
83179  
83180  /**
83181   *     alloc_skb_from_cache    -       allocate a network buffer
83182 @@ -203,14 +205,18 @@ nodata:
83183   */
83184  struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
83185                                      unsigned int size,
83186 -                                    gfp_t gfp_mask)
83187 +                                    gfp_t gfp_mask,
83188 +                                    int fclone)
83189  {
83190 +       kmem_cache_t *cache;
83191 +       struct skb_shared_info *shinfo;
83192         struct sk_buff *skb;
83193         u8 *data;
83194  
83195 +       cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
83196 +
83197         /* Get the HEAD */
83198 -       skb = kmem_cache_alloc(skbuff_head_cache,
83199 -                              gfp_mask & ~__GFP_DMA);
83200 +       skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
83201         if (!skb)
83202                 goto out;
83203  
83204 @@ -227,16 +233,29 @@ struct sk_buff *alloc_skb_from_cache(kme
83205         skb->data = data;
83206         skb->tail = data;
83207         skb->end  = data + size;
83208 +       /* make sure we initialize shinfo sequentially */
83209 +       shinfo = skb_shinfo(skb);
83210 +       atomic_set(&shinfo->dataref, 1);
83211 +       shinfo->nr_frags  = 0;
83212 +       shinfo->tso_size = 0;
83213 +       shinfo->tso_segs = 0;
83214 +       shinfo->ufo_size = 0;
83215 +       shinfo->ip6_frag_id = 0;
83216 +       shinfo->frag_list = NULL;
83217  
83218 -       atomic_set(&(skb_shinfo(skb)->dataref), 1);
83219 -       skb_shinfo(skb)->nr_frags  = 0;
83220 -       skb_shinfo(skb)->tso_size = 0;
83221 -       skb_shinfo(skb)->tso_segs = 0;
83222 -       skb_shinfo(skb)->frag_list = NULL;
83223 +       if (fclone) {
83224 +               struct sk_buff *child = skb + 1;
83225 +               atomic_t *fclone_ref = (atomic_t *) (child + 1);
83226 +
83227 +               skb->fclone = SKB_FCLONE_ORIG;
83228 +               atomic_set(fclone_ref, 1);
83229 +
83230 +               child->fclone = SKB_FCLONE_UNAVAILABLE;
83231 +       }
83232  out:
83233         return skb;
83234  nodata:
83235 -       kmem_cache_free(skbuff_head_cache, skb);
83236 +       kmem_cache_free(cache, skb);
83237         skb = NULL;
83238         goto out;
83239  }
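alloc_skb_from_cache() now takes an fclone flag selecting between skbuff_head_cache and skbuff_fclone_cache; existing callers keep the old behaviour by passing 0. A hedged caller sketch, where my_cache and my_alloc_skb are illustrative names:

    static struct sk_buff *my_alloc_skb(kmem_cache_t *my_cache,
                                        unsigned int size)
    {
            /* fclone = 0: plain skb head, the pre-patch behaviour */
            return alloc_skb_from_cache(my_cache, size, GFP_ATOMIC, 0);
    }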
83240 @@ -408,6 +427,10 @@ struct sk_buff *skb_clone(struct sk_buff
83241         C(local_df);
83242         n->cloned = 1;
83243         n->nohdr = 0;
83244 +#ifdef CONFIG_XEN
83245 +       C(proto_data_valid);
83246 +       C(proto_csum_blank);
83247 +#endif
83248         C(pkt_type);
83249         C(ip_summed);
83250         C(priority);
83251 diff -Nurp ref-linux-2.6.16.9/scripts/Makefile.xen tmp-linux-2.6-xen.patch/scripts/Makefile.xen
83252 --- ref-linux-2.6.16.9/scripts/Makefile.xen     1970-01-01 01:00:00.000000000 +0100
83253 +++ tmp-linux-2.6-xen.patch/scripts/Makefile.xen        2006-04-10 00:05:52.000000000 +0200
83254 @@ -0,0 +1,14 @@
83255 +
83256 +# cherrypickxen($1 = allobj)
83257 +cherrypickxen = $(foreach var, $(1), \
83258 +               $(shell o=$(var); \
83259 +                       c=$${o/%.o/-xen.c}; \
83260 +                       s=$${o/%.o/-xen.S}; \
83261 +                       oxen=$${o/%.o/-xen.o}; \
83262 +                       [ -f $(srctree)/$(src)/$${c} ] || \
83263 +                          [ -f $(srctree)/$(src)/$${s} ] \
83264 +                               && echo $$oxen \
83265 +                               || echo $(var) ) \
83266 +         )
83267 +# filterxen($1 = allobj, $2 = noobjs)
83268 +filterxen = $(filter-out $(2), $(1))